From 507850b9b6878dcc82f637325461838dc15cb391 Mon Sep 17 00:00:00 2001 From: yuanyuhao Date: Wed, 20 May 2026 17:39:06 +0800 Subject: [PATCH] add docs for aggregation function datasketches_hll_union_agg --- .../datasketches_hll_union_agg.md | 95 +++++++++++++++++++ .../datasketches_hll_union_agg.md | 94 ++++++++++++++++++ sidebars.ts | 1 + 3 files changed, 190 insertions(+) create mode 100644 docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md diff --git a/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md b/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md new file mode 100644 index 0000000000000..a3c1b99f8fab0 --- /dev/null +++ b/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -0,0 +1,95 @@ +--- +{ + "title": "DATASKETCHES_HLL_UNION_AGG", + "language": "en", + "description": "The datasketches_hll_union_agg function is an aggregate function used to union multiple Apache DataSketches HLL sketches and return the estimated cardinality of the union." +} +--- + +## Description + +`datasketches_hll_union_agg` is an aggregate function used to **union** multiple Apache DataSketches **HLL** (`hll_sketch`) serialized values and return the **estimated cardinality** (approximate distinct count / NDV) after union. + +This function expects the input to be **serialized bytes of a DataSketches HLL sketch** (for example, generated by `hll_sketch.serialize_compact()` in the DataSketches library). It does not accept arbitrary strings. + +Aliases: + +- `ds_hll_union_count` +- `ds_cardinality` + +## Syntax + +```sql +datasketches_hll_union_agg(<sketch>) +``` + +## Parameters + +| Parameter | Description | +| -- | -- | +| `<sketch>` | The serialized bytes of an Apache DataSketches HLL sketch. 
Supported types: STRING / VARCHAR / BINARY / VARBINARY. NULL values are ignored. An empty string is invalid input and causes an error. | + +## Return Value + +Returns the estimated cardinality as a BIGINT. +If the group contains no valid (non-NULL) data, 0 is returned. +If the input bytes cannot be deserialized as a valid DataSketches HLL sketch (including an empty string), an error is thrown. + +## Example + +```sql +-- setup +CREATE TABLE test_datasketches_hll_union_agg_tbl ( + id INT, + sk STRING +) +DISTRIBUTED BY HASH(id) BUCKETS 1 +PROPERTIES ("replication_num" = "1"); + +-- The sketch bytes are inserted via Base64 decoding. +INSERT INTO test_datasketches_hll_union_agg_tbl VALUES + (1, from_base64('AgEHCAMIBwjL18IEK/L7BoYv+Q11gWYHgbxdBntl5gj8LUIK')), + (2, from_base64('AwEHCAUIAAkKAAAAIjvrBcS1nwfGGWoEyHokBO8t9wc1qTEENkcJB7hWqQxZf9QNnuSbGA==')), + (3, NULL); +``` + +```sql +SELECT datasketches_hll_union_agg(sk) FROM test_datasketches_hll_union_agg_tbl; +``` + +```text ++-------------------------------+ +| datasketches_hll_union_agg(sk) | ++-------------------------------+ +| 17 | ++-------------------------------+ +``` + +```sql +-- aliases +SELECT + datasketches_hll_union_agg(sk), + ds_hll_union_count(sk), + ds_cardinality(sk) +FROM test_datasketches_hll_union_agg_tbl; +``` + +```sql +-- a group containing only NULL inputs returns 0 +SELECT datasketches_hll_union_agg(sk) +FROM test_datasketches_hll_union_agg_tbl +WHERE sk IS NULL; +``` + +```text ++-------------------------------+ +| datasketches_hll_union_agg(sk) | ++-------------------------------+ +| 0 | ++-------------------------------+ +``` + +```sql +-- an empty string is invalid input and raises an error +SELECT datasketches_hll_union_agg(''); +``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md new file mode 100644 index 
0000000000000..4dc43635ad09c --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -0,0 +1,94 @@ +--- +{ + "title": "DATASKETCHES_HLL_UNION_AGG", + "language": "zh-CN", + "description": "datasketches_hll_union_agg 函数是一种聚合函数,用于对多个 Apache DataSketches HLL sketch 的序列化结果进行 union 合并,并返回合并后基数的估算值(近似去重数)。" +} +--- + +## 描述 + +`datasketches_hll_union_agg` 函数是一种聚合函数,用于对多个 **Apache DataSketches HLL sketch(hll_sketch)** 的序列化结果进行 **union 合并**,并返回合并后基数的**估算值**(近似去重数 / NDV)。 + +该函数的输入不是普通字符串,而是 **DataSketches HLL sketch 的序列化字节串**(例如由 DataSketches 的 `hll_sketch.serialize_compact()` 生成)。 + +别名: + +- `ds_hll_union_count` +- `ds_cardinality` + +## 语法 + +```sql +datasketches_hll_union_agg(<sketch>) +``` + +## 参数 + +| 参数 | 说明 | +| -- | -- | +| `<sketch>` | DataSketches HLL sketch 的序列化字节串。支持类型:STRING / VARCHAR / BINARY / VARBINARY。NULL 会被忽略;空字符串属于非法输入,将报错。 | + +## 返回值 + +返回 BIGINT 类型的基数估算值。 +如果组内没有合法数据则返回 0。 +若输入字节串无法反序列化为合法的 DataSketches HLL sketch(包括空字符串),将报错。 + +## 举例 + +```sql +-- setup +CREATE TABLE test_datasketches_hll_union_agg_tbl ( + id INT, + sk STRING +) DISTRIBUTED BY HASH(id) BUCKETS 1 +PROPERTIES ("replication_num" = "1"); + +-- 通过 from_base64() 将 Base64 文本解码为 sketch 字节串后写入 +INSERT INTO test_datasketches_hll_union_agg_tbl VALUES + (1, from_base64('AgEHCAMIBwjL18IEK/L7BoYv+Q11gWYHgbxdBntl5gj8LUIK')), + (2, from_base64('AwEHCAUIAAkKAAAAIjvrBcS1nwfGGWoEyHokBO8t9wc1qTEENkcJB7hWqQxZf9QNnuSbGA==')), + (3, NULL); +``` + +```sql +SELECT datasketches_hll_union_agg(sk) FROM test_datasketches_hll_union_agg_tbl; +``` + +```text ++-------------------------------+ +| datasketches_hll_union_agg(sk) | ++-------------------------------+ +| 17 | ++-------------------------------+ +``` + +```sql +-- 别名用法 +SELECT + datasketches_hll_union_agg(sk), + ds_hll_union_count(sk), + ds_cardinality(sk) +FROM test_datasketches_hll_union_agg_tbl; +``` + +```sql +-- 组内无合法数据返回 0 +SELECT datasketches_hll_union_agg(sk) +FROM 
test_datasketches_hll_union_agg_tbl +WHERE sk IS NULL; +``` + +```text ++-------------------------------+ +| datasketches_hll_union_agg(sk) | ++-------------------------------+ +| 0 | ++-------------------------------+ +``` + +```sql +-- 空字符串属于非法输入,将报错 +SELECT datasketches_hll_union_agg(''); +``` \ No newline at end of file diff --git a/sidebars.ts b/sidebars.ts index 9ee4e8fca691b..c37d871fe21a2 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -1973,6 +1973,7 @@ const sidebars: SidebarsConfig = { 'sql-manual/sql-functions/aggregate-functions/count-by-enum', 'sql-manual/sql-functions/aggregate-functions/covar', 'sql-manual/sql-functions/aggregate-functions/covar-samp', + 'sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg', 'sql-manual/sql-functions/aggregate-functions/group-array-intersect', 'sql-manual/sql-functions/aggregate-functions/group-array-union', 'sql-manual/sql-functions/aggregate-functions/group-bit-and',