diff --git a/.vale.ini b/.vale.ini index 3956a8b4fc5..396707de9a9 100644 --- a/.vale.ini +++ b/.vale.ini @@ -1,5 +1,5 @@ StylesPath = styles -MinAlertLevel = suggestion +MinAlertLevel = warning [*.{md}] BasedOnStyles = ClickHouse diff --git a/docs/_placeholders/api/_invitations-api-reference.md b/docs/_placeholders/api/_invitations-api-reference.md index cd669fd09d2..0c7cd6aa8b7 100644 --- a/docs/_placeholders/api/_invitations-api-reference.md +++ b/docs/_placeholders/api/_invitations-api-reference.md @@ -5,5 +5,5 @@ title: Invitations ## List all invitations {#list-all-invitations} -This file is generated by `clickhouseapi.js` during the build process. If the +This file is generated by `clickhouseapi.js` during the build process. If the content needs changing please edit `clickhouseapi.js`. diff --git a/docs/_placeholders/api/_keys-api-reference.md b/docs/_placeholders/api/_keys-api-reference.md index 4d946f3d80d..d683b445ab2 100644 --- a/docs/_placeholders/api/_keys-api-reference.md +++ b/docs/_placeholders/api/_keys-api-reference.md @@ -5,5 +5,5 @@ title: Keys ## Get list of all keys {#get-list-of-all-keys} -This file is generated by `clickhouseapi.js` during the build process. If the +This file is generated by `clickhouseapi.js` during the build process. If the content needs changing please edit `clickhouseapi.js`. diff --git a/docs/_placeholders/api/_members-api-reference.md b/docs/_placeholders/api/_members-api-reference.md index 01970e1589b..14a49e345c2 100644 --- a/docs/_placeholders/api/_members-api-reference.md +++ b/docs/_placeholders/api/_members-api-reference.md @@ -5,5 +5,5 @@ title: Members ## List organization members {#list-organization-members} -This file is generated by `clickhouseapi.js` during the build process. If the +This file is generated by `clickhouseapi.js` during the build process. If the content needs changing please edit `clickhouseapi.js`. diff --git a/docs/_placeholders/api/_organizations-api-reference.md b/docs/_placeholders/api/_organizations-api-reference.md index 1c4fab9b6db..10acc0dada6 100644 --- a/docs/_placeholders/api/_organizations-api-reference.md +++ b/docs/_placeholders/api/_organizations-api-reference.md @@ -5,5 +5,5 @@ title: Organizations ## Get organization details {#get-organization-details} -This file is generated by `clickhouseapi.js` during the build process. If the +This file is generated by `clickhouseapi.js` during the build process. If the content needs changing please edit `clickhouseapi.js`. diff --git a/docs/_placeholders/api/_services-api-reference.md b/docs/_placeholders/api/_services-api-reference.md index 51405b7a4c9..b3c44de453a 100644 --- a/docs/_placeholders/api/_services-api-reference.md +++ b/docs/_placeholders/api/_services-api-reference.md @@ -5,5 +5,5 @@ title: Services ## List of organization services {#list-of-organization-services} -This file is generated by `clickhouseapi.js` during the build process. If the +This file is generated by `clickhouseapi.js` during the build process. If the content needs changing please edit `clickhouseapi.js`. diff --git a/docs/_snippets/_GCS_authentication_and_bucket.md b/docs/_snippets/_GCS_authentication_and_bucket.md index 546666a8049..6625272e43a 100644 --- a/docs/_snippets/_GCS_authentication_and_bucket.md +++ b/docs/_snippets/_GCS_authentication_and_bucket.md @@ -9,44 +9,24 @@ import GCS_guide_key from '@site/static/images/integrations/data-ingestion/s3/GC import Image from '@theme/IdealImage';
- Create GCS buckets and an HMAC key - +Create GCS buckets and an HMAC key ### ch_bucket_us_east1 {#ch_bucket_us_east1} - Creating a GCS bucket in US East 1 - ### ch_bucket_us_east4 {#ch_bucket_us_east4} - Creating a GCS bucket in US East 4 - ### Generate an access key {#generate-an-access-key} - ### Create a service account HMAC key and secret {#create-a-service-account-hmac-key-and-secret} - Open **Cloud Storage > Settings > Interoperability** and either choose an existing **Access key**, or **CREATE A KEY FOR A SERVICE ACCOUNT**. This guide covers the path for creating a new key for a new service account. - Generating a service account HMAC key in GCS - ### Add a new service account {#add-a-new-service-account} - If this is a project with no existing service account, **CREATE NEW ACCOUNT**. - Adding a new service account in GCS - There are three steps to creating the service account; in the first step, give the account a meaningful name, ID, and description. - Defining a new service account name and ID in GCS - In the Interoperability settings dialog, the IAM role **Storage Object Admin** is recommended; select that role in step two. - Selecting IAM role Storage Object Admin in GCS - Step three is optional and not used in this guide. You may allow users to have these privileges based on your policies. - Configuring additional settings for the new service account in GCS - The service account HMAC key will be displayed. Save this information, as it will be used in the ClickHouse configuration. - Retrieving the generated HMAC key for GCS -
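The HMAC key and secret saved above are later referenced from the ClickHouse configuration. A minimal sketch of such a disk definition, assuming the bucket `ch_bucket_us_east1` from the headings above, a hypothetical folder `clickhouse_data`, and placeholder HMAC credentials, might look like this:

```xml
<!-- Illustrative sketch only, e.g. /etc/clickhouse-server/config.d/gcs_storage.xml -->
<clickhouse>
    <storage_configuration>
        <disks>
            <gcs_disk>
                <!-- GCS is reached through its S3-compatible XML API, so the disk type is s3 -->
                <type>s3</type>
                <endpoint>https://storage.googleapis.com/ch_bucket_us_east1/clickhouse_data/</endpoint>
                <access_key_id>SERVICE_ACCOUNT_HMAC_KEY</access_key_id>            <!-- placeholder -->
                <secret_access_key>SERVICE_ACCOUNT_HMAC_SECRET</secret_access_key> <!-- placeholder -->
            </gcs_disk>
        </disks>
        <policies>
            <gcs_main>
                <volumes>
                    <main>
                        <disk>gcs_disk</disk>
                    </main>
                </volumes>
            </gcs_main>
        </policies>
    </storage_configuration>
</clickhouse>
```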
diff --git a/docs/_snippets/_S3_authentication_and_bucket.md b/docs/_snippets/_S3_authentication_and_bucket.md index 1cf34667582..a72c6521f71 100644 --- a/docs/_snippets/_S3_authentication_and_bucket.md +++ b/docs/_snippets/_S3_authentication_and_bucket.md @@ -18,118 +18,78 @@ import s3_g from '@site/static/images/_snippets/s3/s3-g.png'; import s3_h from '@site/static/images/_snippets/s3/s3-h.png';
- Create S3 buckets and an IAM user - +Create S3 buckets and an IAM user This article demonstrates the basics of how to configure an AWS IAM user, create an S3 bucket and configure ClickHouse to use the bucket as an S3 disk. You should work with your security team to determine the permissions to be used, and consider these as a starting point. - ### Create an AWS IAM user {#create-an-aws-iam-user} In this procedure, we'll be creating a service account user, not a login user. -1. Log into the AWS IAM Management Console. - +1. Log into the AWS IAM Management Console. 2. In "users", select **Add users** - AWS IAM Management Console - Adding a new user - 3. Enter the user name and set the credential type to **Access key - Programmatic access** and select **Next: Permissions** - Setting user name and access type for IAM user - 4. Do not add the user to any group; select **Next: Tags** - Skipping group assignment for IAM user - 5. Unless you need to add any tags, select **Next: Review** - Skipping tag assignment for IAM user - 6. Select **Create User** - - :::note - The warning message stating that the user has no permissions can be ignored; permissions will be granted on the bucket for the user in the next section - ::: - +:::note +The warning message stating that the user has no permissions can be ignored; permissions will be granted on the bucket for the user in the next section +::: Creating the IAM user with no permissions warning - 7. The user is now created; click on **show** and copy the access and secret keys. :::note Save the keys somewhere else; this is the only time that the secret access key will be available. ::: - Viewing and copying the IAM user access keys - 8. Click close, then find the user in the users screen. - Finding the newly created IAM user in the users list - 9. Copy the ARN (Amazon Resource Name) and save it for use when configuring the access policy for the bucket. - Copying the ARN of the IAM user - ### Create an S3 bucket {#create-an-s3-bucket} 1. In the S3 bucket section, select **Create bucket** - Starting the S3 bucket creation process - 2. Enter a bucket name, leave other options default :::note The bucket name must be unique across AWS, not just the organization, or it will emit an error. ::: 3. Leave `Block all Public Access` enabled; public access is not needed. - Configuring the S3 bucket settings with public access blocked - 4. Select **Create Bucket** at the bottom of the page - Finalizing S3 bucket creation - 5. Select the link, copy the ARN, and save it for use when configuring the access policy for the bucket. - 6. Once the bucket has been created, find the new S3 bucket in the S3 buckets list and select the link - Finding the newly created S3 bucket in the buckets list - 7. Select **Create folder** - Creating a new folder in the S3 bucket - 8. Enter a folder name that will be the target for the ClickHouse S3 disk and select **Create folder** - Setting the folder name for ClickHouse S3 disk usage - 9. The folder should now be visible on the bucket list - Viewing the newly created folder in the S3 bucket - 10. Select the checkbox for the new folder and click on **Copy URL** Save the URL copied to be used in the ClickHouse storage configuration in the next section. - Copying the S3 folder URL for ClickHouse configuration - 11. Select the **Permissions** tab and click on the **Edit** button in the **Bucket Policy** section - Accessing the S3 bucket policy configuration - 12. 
Add a bucket policy, example below: ```json { - "Version" : "2012-10-17", - "Id" : "Policy123456", - "Statement" : [ - { - "Sid" : "abc123", - "Effect" : "Allow", - "Principal" : { - "AWS" : "arn:aws:iam::921234567898:user/mars-s3-user" - }, - "Action" : "s3:*", - "Resource" : [ - "arn:aws:s3:::mars-doc-test", - "arn:aws:s3:::mars-doc-test/*" - ] - } - ] +"Version" : "2012-10-17", +"Id" : "Policy123456", +"Statement" : [ +{ +"Sid" : "abc123", +"Effect" : "Allow", +"Principal" : { +"AWS" : "arn:aws:iam::921234567898:user/mars-s3-user" +}, +"Action" : "s3:*", +"Resource" : [ +"arn:aws:s3:::mars-doc-test", +"arn:aws:s3:::mars-doc-test/*" +] +} +] } ``` - ```response |Parameter | Description | Example Value | |----------|-------------|----------------| @@ -140,13 +100,10 @@ The bucket name must be unique across AWS, not just the organization, or it will |Action | What operations are allowed on the bucket| s3:*| |Resource | Which resources in the bucket will operations be allowed in | "arn:aws:s3:::mars-doc-test", "arn:aws:s3:::mars-doc-test/*" | ``` - :::note You should work with your security team to determine the permissions to be used, consider these as a starting point. For more information on Policies and settings, refer to AWS documentation: https://docs.aws.amazon.com/AmazonS3/latest/userguide/access-policy-language-overview.html ::: - 13. Save the policy configuration. -
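The folder URL copied in step 10 is later used in the ClickHouse storage configuration. A minimal sketch of what that configuration might look like, assuming the example bucket `mars-doc-test` from the policy above, a hypothetical folder `docs-folder`, and placeholder keys for the IAM user created earlier:

```xml
<!-- Illustrative sketch only, e.g. /etc/clickhouse-server/config.d/s3_storage.xml -->
<clickhouse>
    <storage_configuration>
        <disks>
            <s3_disk>
                <type>s3</type>
                <!-- bucket URL plus the folder copied in step 10 -->
                <endpoint>https://mars-doc-test.s3.amazonaws.com/docs-folder/</endpoint>
                <access_key_id>IAM_USER_ACCESS_KEY</access_key_id>         <!-- placeholder -->
                <secret_access_key>IAM_USER_SECRET_KEY</secret_access_key> <!-- placeholder -->
                <metadata_path>/var/lib/clickhouse/disks/s3_disk/</metadata_path>
            </s3_disk>
        </disks>
        <policies>
            <s3_main>
                <volumes>
                    <main>
                        <disk>s3_disk</disk>
                    </main>
                </volumes>
            </s3_main>
        </policies>
    </storage_configuration>
</clickhouse>
```

A MergeTree table would then opt into this disk with `SETTINGS storage_policy = 's3_main'`.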
diff --git a/docs/_snippets/_add_remote_ip_access_list_detail.md b/docs/_snippets/_add_remote_ip_access_list_detail.md index fccdb2467ee..ab4ae2ec2eb 100644 --- a/docs/_snippets/_add_remote_ip_access_list_detail.md +++ b/docs/_snippets/_add_remote_ip_access_list_detail.md @@ -3,14 +3,9 @@ import ip_allow_list_check_list from '@site/static/images/_snippets/ip-allow-lis import ip_allow_list_add_current_ip from '@site/static/images/_snippets/ip-allow-list-add-current-ip.png';
- Manage your IP Access List - +Manage your IP Access List From your ClickHouse Cloud services list choose the service that you will work with and switch to **Settings**. If the IP Access List does not contain the IP Address or range of the remote system that needs to connect to your ClickHouse Cloud service, then you can resolve the problem with **Add IPs**: - Check to see if the service allows traffic from your IP address in the IP Access List - Add the individual IP Address, or the range of addresses that need to connect to your ClickHouse Cloud service. Modify the form as you see fit and then **Save**. - Add your current IP address to the IP Access List in ClickHouse Cloud -
diff --git a/docs/_snippets/_add_superset_detail.md b/docs/_snippets/_add_superset_detail.md index 88e64ec9be1..4c549a8fea7 100644 --- a/docs/_snippets/_add_superset_detail.md +++ b/docs/_snippets/_add_superset_detail.md @@ -1,47 +1,30 @@
- Launch Apache Superset in Docker - +Launch Apache Superset in Docker Superset provides instructions for [installing Superset locally using Docker Compose](https://superset.apache.org/docs/installation/installing-superset-using-docker-compose/). After checking out the Apache Superset repo from GitHub, you can run the latest development code, or a specific tag. We recommend release 2.0.0 as it is the latest release not marked as `pre-release`. - There are a few tasks to be done before running `docker compose`: - 1. Add the official ClickHouse Connect driver 2. Obtain a Mapbox API key and add that as an environment variable (optional) 3. Specify the version of Superset to run - :::tip The commands below are to be run from the top level of the GitHub repo, `superset`. ::: - ## Official ClickHouse connect driver {#official-clickhouse-connect-driver} - To make the ClickHouse Connect driver available in the Superset deployment, add it to the local requirements file: - ```bash echo "clickhouse-connect" >> ./docker/requirements-local.txt ``` - ## Mapbox {#mapbox} - This is optional; you can plot location data in Superset without a Mapbox API key, but you will see a message telling you that you should add a key, and the background image of the map will be missing (you will only see the data points and not the map background). Mapbox provides a free tier if you would like to use it. - Some of the sample visualizations that the guides have you create use location data, for example longitude and latitude. Superset includes support for Mapbox maps. To use the Mapbox visualizations, you need a Mapbox API key. Sign up for the [Mapbox free tier](https://account.mapbox.com/auth/signup/), and generate an API key. - Make the API key available to Superset: - ```bash echo "MAPBOX_API_KEY=pk.SAMPLE-Use-your-key-instead" >> docker/.env-non-dev ``` - ## Deploy Superset version 2.0.0 {#deploy-superset-version-200} - To deploy release 2.0.0, run: - ```bash git checkout 2.0.0 TAG=2.0.0 docker-compose -f docker-compose-non-dev.yml pull TAG=2.0.0 docker-compose -f docker-compose-non-dev.yml up ``` -
- diff --git a/docs/_snippets/_aws_regions.md b/docs/_snippets/_aws_regions.md index 9f7d2e603a8..5d45c4c363c 100644 --- a/docs/_snippets/_aws_regions.md +++ b/docs/_snippets/_aws_regions.md @@ -9,7 +9,3 @@ |us-east-1 c1 | com.amazonaws.vpce.us-east-1.vpce-svc-096c118db1ff20ea4 | use1-az6 use1-az4 use1-az2 | |us-east-2 | com.amazonaws.vpce.us-east-2.vpce-svc-0b99748bf269a86b4 | use2-az1 use2-az2 use2-az3 | |us-west-2 | com.amazonaws.vpce.us-west-2.vpce-svc-049bbd33f61271781 | usw2-az2 usw2-az1 usw2-az3 | - - - - diff --git a/docs/_snippets/_clickhouse_mysql_cloud_setup.mdx b/docs/_snippets/_clickhouse_mysql_cloud_setup.mdx index 91daf6f1f08..07e11c13d9e 100644 --- a/docs/_snippets/_clickhouse_mysql_cloud_setup.mdx +++ b/docs/_snippets/_clickhouse_mysql_cloud_setup.mdx @@ -7,34 +7,26 @@ import Image from '@theme/IdealImage';
1. After creating your ClickHouse Cloud Service, on the `Connect your app` screen, select MySQL from the drop-down. -
- -ClickHouse Cloud credentials screen showing MySQL interface selection dropdown + ClickHouse Cloud credentials screen showing MySQL interface selection dropdown 2. Toggle the switch to enable the MySQL interface for this specific service. This will expose port `3306` for this service and prompt you with your MySQL connection screen that includes your unique MySQL username. -ClickHouse Cloud MySQL interface enabling toggle and connection details -
+ ClickHouse Cloud MySQL interface enabling toggle and connection details -Alternatively, in order to enable the MySQL interface for an existing service: + Alternatively, in order to enable the MySQL interface for an existing service: 3. Ensure your service is in the `Running` state, then click on the service you want to enable the MySQL interface for. Select "Connect" from the left menu: -
-ClickHouse Cloud service connection screen with Connect option highlighted -
- + ClickHouse Cloud service connection screen with Connect option highlighted 4. Select MySQL from the `Connect With` drop-down. -
-ClickHouse Cloud connection screen showing MySQL option selection -
+ ClickHouse Cloud connection screen showing MySQL option selection 5. Toggle the switch to enable the MySQL interface for this specific service. This will expose port `3306` for this service and prompt you with your MySQL connection screen that include your unique MySQL username. -ClickHouse Cloud connection screen with MySQL interface enabled showing connection details + ClickHouse Cloud connection screen with MySQL interface enabled showing connection details ## Creating multiple MySQL users in ClickHouse Cloud {#creating-multiple-mysql-users-in-clickhouse-cloud} diff --git a/docs/_snippets/_clickhouse_mysql_on_premise_setup.mdx b/docs/_snippets/_clickhouse_mysql_on_premise_setup.mdx index cc9f0f5bf7e..29cef256f5a 100644 --- a/docs/_snippets/_clickhouse_mysql_on_premise_setup.mdx +++ b/docs/_snippets/_clickhouse_mysql_on_premise_setup.mdx @@ -89,5 +89,5 @@ Read 4 rows, 603.00 B in 0.00156 sec., 2564 rows/sec., 377.48 KiB/sec. Finally, configure the Clickhouse Server to listen on the desired IP address(es). For example, in `config.xml`, uncomment out the following to listen on all addresses: ```bash -:: +:: ``` diff --git a/docs/_snippets/_config-files.md b/docs/_snippets/_config-files.md index a7e813136e4..d50480d69d2 100644 --- a/docs/_snippets/_config-files.md +++ b/docs/_snippets/_config-files.md @@ -3,5 +3,5 @@ When configuring ClickHouse Server by adding or editing configuration files you - Add files to `/etc/clickhouse-server/config.d/` directory - Add files to `/etc/clickhouse-server/users.d/` directory - Leave the `/etc/clickhouse-server/config.xml` file as it is -- Leave the `/etc/clickhouse-server/users.xml` file as it is -::: +- Leave the `/etc/clickhouse-server/users.xml` file as it is + ::: diff --git a/docs/_snippets/_gather_your_details_http.mdx b/docs/_snippets/_gather_your_details_http.mdx index 3415ec3119c..7ee962cfac6 100644 --- a/docs/_snippets/_gather_your_details_http.mdx +++ b/docs/_snippets/_gather_your_details_http.mdx @@ -10,12 +10,12 @@ To connect to ClickHouse with HTTP(S) you need this information: - The USERNAME and PASSWORD: out of the box, the username is `default`. Use the username appropriate for your use case. -The details for your ClickHouse Cloud service are available in the ClickHouse Cloud console. Select the service that you will connect to and click **Connect**: + The details for your ClickHouse Cloud service are available in the ClickHouse Cloud console. Select the service that you will connect to and click **Connect**: -ClickHouse Cloud service connect button + ClickHouse Cloud service connect button -Choose **HTTPS**, and the details are available in an example `curl` command. + Choose **HTTPS**, and the details are available in an example `curl` command. -ClickHouse Cloud HTTPS connection details + ClickHouse Cloud HTTPS connection details -If you are using self-managed ClickHouse, the connection details are set by your ClickHouse administrator. + If you are using self-managed ClickHouse, the connection details are set by your ClickHouse administrator. 
diff --git a/docs/_snippets/_gather_your_details_native.md b/docs/_snippets/_gather_your_details_native.md index c69cfac7a24..e98835f5459 100644 --- a/docs/_snippets/_gather_your_details_native.md +++ b/docs/_snippets/_gather_your_details_native.md @@ -2,7 +2,6 @@ import cloud_connect_button from '@site/static/images/_snippets/cloud-connect-bu import connection_details_native from '@site/static/images/_snippets/connection-details-native.png'; import Image from '@theme/IdealImage'; - To connect to ClickHouse with native TCP you need this information: - The HOST and PORT: typically, the port is 9440 when using TLS, or 9000 when not using TLS. @@ -11,12 +10,12 @@ To connect to ClickHouse with native TCP you need this information: - The USERNAME and PASSWORD: out of the box the username is `default`. Use the username appropriate for your use case. -The details for your ClickHouse Cloud service are available in the ClickHouse Cloud console. Select the service that you will connect to and click **Connect**: + The details for your ClickHouse Cloud service are available in the ClickHouse Cloud console. Select the service that you will connect to and click **Connect**: -ClickHouse Cloud service connect button + ClickHouse Cloud service connect button -Choose **Native**, and the details are available in an example `clickhouse-client` command. + Choose **Native**, and the details are available in an example `clickhouse-client` command. -ClickHouse Cloud Native TCP connection details + ClickHouse Cloud Native TCP connection details -If you are using self-managed ClickHouse, the connection details are set by your ClickHouse administrator. + If you are using self-managed ClickHouse, the connection details are set by your ClickHouse administrator. diff --git a/docs/_snippets/_gcp_regions.md b/docs/_snippets/_gcp_regions.md index 2b1311ffdd8..786b76e8b2b 100644 --- a/docs/_snippets/_gcp_regions.md +++ b/docs/_snippets/_gcp_regions.md @@ -4,6 +4,3 @@ |`europe-west4`| `projects/dataplane-production/regions/europe-west4/serviceAttachments/production-europe-west4-clickhouse-cloud`| `europe-west4.p.gcp.clickhouse.cloud` | |`us-central1`| `projects/dataplane-production/regions/us-central1/serviceAttachments/production-us-central1-clickhouse-cloud` | `us-central1.p.gcp.clickhouse.cloud` | |`us-east1`| `projects/dataplane-production/regions/us-east1/serviceAttachments/production-us-east1-clickhouse-cloud` | `us-east1.p.gcp.clickhouse.cloud` | - - - diff --git a/docs/_snippets/_keeper-config-files.md b/docs/_snippets/_keeper-config-files.md index bbb9c3e0828..4ccac552f37 100644 --- a/docs/_snippets/_keeper-config-files.md +++ b/docs/_snippets/_keeper-config-files.md @@ -1,5 +1,5 @@ :::important best practices When configuring ClickHouse Keeper by editing configuration files you should: -- Backup the `/etc/clickhouse-keeper/keeper_config.xml` +- Backup the `/etc/clickhouse-keeper/keeper_config.xml` - Edit the `/etc/clickhouse-keeper/keeper_config.xml` file -::: + ::: diff --git a/docs/_snippets/_launch_sql_console.md b/docs/_snippets/_launch_sql_console.md index 334ca310a88..5de3c6113e4 100644 --- a/docs/_snippets/_launch_sql_console.md +++ b/docs/_snippets/_launch_sql_console.md @@ -2,20 +2,14 @@ import cloud_connect_to_sql_console from '@site/static/images/_snippets/cloud-co import createservice8 from '@site/static/images/_snippets/createservice8.png'; import Image from '@theme/IdealImage'; - :::tip SQL console If you need a SQL client connection, your ClickHouse Cloud service has an associated web based SQL 
console; expand **Connect to SQL console** below for details. :::
- Connect to SQL console - +Connect to SQL console From your ClickHouse Cloud services list, click on a service. - Connect to SQL Console - This will redirect you to the SQL console. - SQL Console -
diff --git a/docs/_snippets/_replication-sharding-terminology.md b/docs/_snippets/_replication-sharding-terminology.md index e463d9b6815..bfb4432053d 100644 --- a/docs/_snippets/_replication-sharding-terminology.md +++ b/docs/_snippets/_replication-sharding-terminology.md @@ -1,6 +1,6 @@ ## Terminology {#terminology} ### Replica {#replica} -A copy of data. ClickHouse always has at least one copy of your data, and so the minimum number of **replicas** is one. This is an important detail, you may not be used to counting the original copy of your data as a replica, but that is the term used in ClickHouse code and documentation. Adding a second replica of your data provides fault tolerance. +A copy of data. ClickHouse always has at least one copy of your data, and so the minimum number of **replicas** is one. This is an important detail, you may not be used to counting the original copy of your data as a replica, but that is the term used in ClickHouse code and documentation. Adding a second replica of your data provides fault tolerance. ### Shard {#shard} A subset of data. ClickHouse always has at least one shard for your data, so if you do not split the data across multiple servers, your data will be stored in one shard. Sharding data across multiple servers can be used to divide the load if you exceed the capacity of a single server. The destination server is determined by the **sharding key**, and is defined when you create the distributed table. The sharding key can be random or as an output of a [hash function](/sql-reference/functions/hash-functions). The deployment examples involving sharding will use `rand()` as the sharding key, and will provide further information on when and how to choose a different sharding key. diff --git a/docs/_snippets/_tabs.md b/docs/_snippets/_tabs.md index 25cbabd8dcd..f5af720af04 100644 --- a/docs/_snippets/_tabs.md +++ b/docs/_snippets/_tabs.md @@ -10,15 +10,9 @@ import CodeBlock from '@theme/CodeBlock'; - Cloud - - Self-managed - - - diff --git a/docs/_snippets/_users-and-roles-common.md b/docs/_snippets/_users-and-roles-common.md index 29726229be9..c47da17defc 100644 --- a/docs/_snippets/_users-and-roles-common.md +++ b/docs/_snippets/_users-and-roles-common.md @@ -44,62 +44,62 @@ Create these tables and users to be used in the examples. 1. Create a test database - ```sql - CREATE DATABASE db1; - ``` + ```sql + CREATE DATABASE db1; + ``` 2. Create a table - ```sql - CREATE TABLE db1.table1 ( + ```sql + CREATE TABLE db1.table1 ( id UInt64, column1 String, column2 String - ) - ENGINE MergeTree - ORDER BY id; - ``` + ) + ENGINE MergeTree + ORDER BY id; + ``` 3. Populate the table with sample rows - ```sql - INSERT INTO db1.table1 + ```sql + INSERT INTO db1.table1 (id, column1, column2) - VALUES + VALUES (1, 'A', 'abc'), (2, 'A', 'def'), (3, 'B', 'abc'), (4, 'B', 'def'); - ``` + ``` 4. Verify the table: - ```sql - SELECT * - FROM db1.table1 - ``` + ```sql + SELECT * + FROM db1.table1 + ``` - ```response - Query id: 475015cc-6f51-4b20-bda2-3c9c41404e49 + ```response + Query id: 475015cc-6f51-4b20-bda2-3c9c41404e49 - ┌─id─┬─column1─┬─column2─┐ - │ 1 │ A │ abc │ - │ 2 │ A │ def │ - │ 3 │ B │ abc │ - │ 4 │ B │ def │ - └────┴─────────┴─────────┘ - ``` + ┌─id─┬─column1─┬─column2─┐ + │ 1 │ A │ abc │ + │ 2 │ A │ def │ + │ 3 │ B │ abc │ + │ 4 │ B │ def │ + └────┴─────────┴─────────┘ + ``` 5. 
Create a regular user that will be used to demonstrate restrict access to certain columns: - ```sql - CREATE USER column_user IDENTIFIED BY 'password'; - ``` + ```sql + CREATE USER column_user IDENTIFIED BY 'password'; + ``` 6. Create a regular user that will be used to demonstrate restricting access to rows with certain values: - ```sql - CREATE USER row_user IDENTIFIED BY 'password'; - ``` + ```sql + CREATE USER row_user IDENTIFIED BY 'password'; + ``` #### Creating roles {#creating-roles} @@ -109,54 +109,54 @@ With this set of examples: - privileges will be granted to the roles - users will be assigned to each role -Roles are used to define groups of users for certain privileges instead of managing each user separately. + Roles are used to define groups of users for certain privileges instead of managing each user separately. -1. Create a role to restrict users of this role to only see `column1` in database `db1` and `table1`: +1. Create a role to restrict users of this role to only see `column1` in database `db1` and `table1`: ```sql CREATE ROLE column1_users; ``` -2. Set privileges to allow view on `column1` +2. Set privileges to allow view on `column1` ```sql GRANT SELECT(id, column1) ON db1.table1 TO column1_users; ``` -3. Add the `column_user` user to the `column1_users` role +3. Add the `column_user` user to the `column1_users` role ```sql GRANT column1_users TO column_user; ``` -4. Create a role to restrict users of this role to only see selected rows, in this case, only rows containing `A` in `column1` +4. Create a role to restrict users of this role to only see selected rows, in this case, only rows containing `A` in `column1` ```sql CREATE ROLE A_rows_users; ``` -5. Add the `row_user` to the `A_rows_users` role +5. Add the `row_user` to the `A_rows_users` role ```sql GRANT A_rows_users TO row_user; ``` -6. Create a policy to allow view on only where `column1` has the values of `A` +6. Create a policy to allow view on only where `column1` has the values of `A` ```sql CREATE ROW POLICY A_row_filter ON db1.table1 FOR SELECT USING column1 = 'A' TO A_rows_users; ``` -7. Set privileges to the database and table +7. Set privileges to the database and table ```sql GRANT SELECT(id, column1, column2) ON db1.table1 TO A_rows_users; ``` -8. grant explicit permissions for other roles to still have access to all rows +8. grant explicit permissions for other roles to still have access to all rows ```sql - CREATE ROW POLICY allow_other_users_filter + CREATE ROW POLICY allow_other_users_filter ON db1.table1 FOR SELECT USING 1 TO clickhouse_admin, column1_users; ``` @@ -170,104 +170,104 @@ Roles are used to define groups of users for certain privileges instead of manag 1. Log into the clickhouse client using the `clickhouse_admin` user - ```bash - clickhouse-client --user clickhouse_admin --password password - ``` + ```bash + clickhouse-client --user clickhouse_admin --password password + ``` 2. Verify access to database, table and all rows with the admin user. - ```sql - SELECT * - FROM db1.table1 - ``` + ```sql + SELECT * + FROM db1.table1 + ``` - ```response - Query id: f5e906ea-10c6-45b0-b649-36334902d31d + ```response + Query id: f5e906ea-10c6-45b0-b649-36334902d31d - ┌─id─┬─column1─┬─column2─┐ - │ 1 │ A │ abc │ - │ 2 │ A │ def │ - │ 3 │ B │ abc │ - │ 4 │ B │ def │ - └────┴─────────┴─────────┘ - ``` + ┌─id─┬─column1─┬─column2─┐ + │ 1 │ A │ abc │ + │ 2 │ A │ def │ + │ 3 │ B │ abc │ + │ 4 │ B │ def │ + └────┴─────────┴─────────┘ + ``` 3. 
Log into the ClickHouse client using the `column_user` user - ```bash - clickhouse-client --user column_user --password password - ``` + ```bash + clickhouse-client --user column_user --password password + ``` 4. Test `SELECT` using all columns - ```sql - SELECT * - FROM db1.table1 - ``` + ```sql + SELECT * + FROM db1.table1 + ``` - ```response - Query id: 5576f4eb-7450-435c-a2d6-d6b49b7c4a23 + ```response + Query id: 5576f4eb-7450-435c-a2d6-d6b49b7c4a23 - 0 rows in set. Elapsed: 0.006 sec. + 0 rows in set. Elapsed: 0.006 sec. - Received exception from server (version 22.3.2): - Code: 497. DB::Exception: Received from localhost:9000. - DB::Exception: column_user: Not enough privileges. - To execute this query it's necessary to have grant - SELECT(id, column1, column2) ON db1.table1. (ACCESS_DENIED) - ``` + Received exception from server (version 22.3.2): + Code: 497. DB::Exception: Received from localhost:9000. + DB::Exception: column_user: Not enough privileges. + To execute this query it's necessary to have grant + SELECT(id, column1, column2) ON db1.table1. (ACCESS_DENIED) + ``` - :::note - Access is denied since all columns were specified and the user only has access to `id` and `column1` - ::: + :::note + Access is denied since all columns were specified and the user only has access to `id` and `column1` + ::: 5. Verify `SELECT` query with only columns specified and allowed: - ```sql - SELECT + ```sql + SELECT id, column1 - FROM db1.table1 - ``` + FROM db1.table1 + ``` - ```response - Query id: cef9a083-d5ce-42ff-9678-f08dc60d4bb9 + ```response + Query id: cef9a083-d5ce-42ff-9678-f08dc60d4bb9 - ┌─id─┬─column1─┐ - │ 1 │ A │ - │ 2 │ A │ - │ 3 │ B │ - │ 4 │ B │ - └────┴─────────┘ - ``` + ┌─id─┬─column1─┐ + │ 1 │ A │ + │ 2 │ A │ + │ 3 │ B │ + │ 4 │ B │ + └────┴─────────┘ + ``` ### Testing role privileges with row restricted user {#testing-role-privileges-with-row-restricted-user} 1. Log into the ClickHouse client using `row_user` - ```bash - clickhouse-client --user row_user --password password - ``` + ```bash + clickhouse-client --user row_user --password password + ``` 2. View rows available - ```sql - SELECT * - FROM db1.table1 - ``` + ```sql + SELECT * + FROM db1.table1 + ``` - ```response - Query id: a79a113c-1eca-4c3f-be6e-d034f9a220fb + ```response + Query id: a79a113c-1eca-4c3f-be6e-d034f9a220fb - ┌─id─┬─column1─┬─column2─┐ - │ 1 │ A │ abc │ - │ 2 │ A │ def │ - └────┴─────────┴─────────┘ - ``` + ┌─id─┬─column1─┬─column2─┐ + │ 1 │ A │ abc │ + │ 2 │ A │ def │ + └────┴─────────┴─────────┘ + ``` - :::note - Verify that only the above two rows are returned, rows with the value `B` in `column1` should be excluded. - ::: + :::note + Verify that only the above two rows are returned, rows with the value `B` in `column1` should be excluded. + ::: ## Modifying users and roles {#modifying-users-and-roles} @@ -277,64 +277,64 @@ For example, if one `role1` allows for only select on `column1` and `role2` allo 1. Using the admin account, create new user to restrict by both row and column with default roles - ```sql - CREATE USER row_and_column_user IDENTIFIED BY 'password' DEFAULT ROLE A_rows_users; - ``` + ```sql + CREATE USER row_and_column_user IDENTIFIED BY 'password' DEFAULT ROLE A_rows_users; + ``` 2. Remove prior privileges for `A_rows_users` role - ```sql - REVOKE SELECT(id, column1, column2) ON db1.table1 FROM A_rows_users; - ``` + ```sql + REVOKE SELECT(id, column1, column2) ON db1.table1 FROM A_rows_users; + ``` 3. 
Allow `A_row_users` role to only select from `column1` - ```sql - GRANT SELECT(id, column1) ON db1.table1 TO A_rows_users; - ``` + ```sql + GRANT SELECT(id, column1) ON db1.table1 TO A_rows_users; + ``` 4. Log into the ClickHouse client using `row_and_column_user` - ```bash - clickhouse-client --user row_and_column_user --password password; - ``` + ```bash + clickhouse-client --user row_and_column_user --password password; + ``` 5. Test with all columns: - ```sql - SELECT * - FROM db1.table1 - ``` + ```sql + SELECT * + FROM db1.table1 + ``` - ```response - Query id: 8cdf0ff5-e711-4cbe-bd28-3c02e52e8bc4 + ```response + Query id: 8cdf0ff5-e711-4cbe-bd28-3c02e52e8bc4 - 0 rows in set. Elapsed: 0.005 sec. + 0 rows in set. Elapsed: 0.005 sec. - Received exception from server (version 22.3.2): - Code: 497. DB::Exception: Received from localhost:9000. - DB::Exception: row_and_column_user: Not enough privileges. - To execute this query it's necessary to have grant - SELECT(id, column1, column2) ON db1.table1. (ACCESS_DENIED) - ``` + Received exception from server (version 22.3.2): + Code: 497. DB::Exception: Received from localhost:9000. + DB::Exception: row_and_column_user: Not enough privileges. + To execute this query it's necessary to have grant + SELECT(id, column1, column2) ON db1.table1. (ACCESS_DENIED) + ``` 6. Test with limited allowed columns: - ```sql - SELECT + ```sql + SELECT id, column1 - FROM db1.table1 - ``` + FROM db1.table1 + ``` - ```response - Query id: 5e30b490-507a-49e9-9778-8159799a6ed0 + ```response + Query id: 5e30b490-507a-49e9-9778-8159799a6ed0 - ┌─id─┬─column1─┐ - │ 1 │ A │ - │ 2 │ A │ - └────┴─────────┘ - ``` + ┌─id─┬─column1─┐ + │ 1 │ A │ + │ 2 │ A │ + └────┴─────────┘ + ``` ## Troubleshooting {#troubleshooting} @@ -406,11 +406,10 @@ The following commands can be used to: - delete policies - unassign users from roles - delete users and roles -
-:::tip -Run these commands as an admin user or the `default` user -::: + :::tip + Run these commands as an admin user or the `default` user + ::: ### Remove privilege from a role {#remove-privilege-from-a-role} diff --git a/docs/about-us/adopters.md b/docs/about-us/adopters.md index 36bb9c60c09..ba1d62e63ef 100644 --- a/docs/about-us/adopters.md +++ b/docs/about-us/adopters.md @@ -559,5 +559,4 @@ The following list of companies using ClickHouse and their success stories is as | [ЦФТ](https://cft.ru/) | Banking, Financial products, Payments | — | [Meetup in Russian, April 2020](https://team.cft.ru/events/162) | — | — | | [Цифровой Рабочий](https://promo.croc.ru/digitalworker) | Industrial IoT, Analytics | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) | — | — | - diff --git a/docs/about-us/beta-and-experimental-features.md b/docs/about-us/beta-and-experimental-features.md index a38c8d9ab5b..2ebb37d0944 100644 --- a/docs/about-us/beta-and-experimental-features.md +++ b/docs/about-us/beta-and-experimental-features.md @@ -22,9 +22,9 @@ The sections below explicitly describe the properties of **Beta** and **Experime - Possibly enabled in ClickHouse Cloud - The ClickHouse team supports beta features -You can find below the features considered Beta in ClickHouse Cloud and are available for use in your ClickHouse Cloud Services. + You can find below the features considered Beta in ClickHouse Cloud and are available for use in your ClickHouse Cloud Services. -Note: please be sure to be using a current version of the ClickHouse [compatibility](/operations/settings/settings#compatibility) setting to be using a recently introduced feature. + Note: please be sure to be using a current version of the ClickHouse [compatibility](/operations/settings/settings#compatibility) setting to be using a recently introduced feature. ## Experimental features {#experimental-features} @@ -37,11 +37,11 @@ Note: please be sure to be using a current version of the ClickHouse [compatibil - May lack important functionality and documentation - Cannot be enabled in the cloud -Please note: no additional experimental features are allowed to be enabled in ClickHouse Cloud other than those listed above as Beta. + Please note: no additional experimental features are allowed to be enabled in ClickHouse Cloud other than those listed above as Beta. - + --> - - + + diff --git a/docs/about-us/cloud.md b/docs/about-us/cloud.md index 4c641112492..b74251c6dc8 100644 --- a/docs/about-us/cloud.md +++ b/docs/about-us/cloud.md @@ -8,7 +8,7 @@ title: 'ClickHouse Cloud' # ClickHouse Cloud -ClickHouse Cloud is the cloud offering created by the original creators of the popular open-source OLAP database ClickHouse. +ClickHouse Cloud is the cloud offering created by the original creators of the popular open-source OLAP database ClickHouse. You can experience ClickHouse Cloud by [starting a free trial](https://console.clickhouse.cloud/signUp). ## ClickHouse Cloud benefits {#clickhouse-cloud-benefits} diff --git a/docs/about-us/distinctive-features.md b/docs/about-us/distinctive-features.md index 75e4413cb47..e314e01cad4 100644 --- a/docs/about-us/distinctive-features.md +++ b/docs/about-us/distinctive-features.md @@ -73,9 +73,9 @@ In ClickHouse "low latency" means that queries can be processed without delay an ClickHouse provides various ways to trade accuracy for performance: -1. Aggregate functions for approximated calculation of the number of distinct values, medians, and quantiles. -2. 
Running a query based on a part ([SAMPLE](../sql-reference/statements/select/sample.md)) of data and getting an approximated result. In this case, proportionally less data is retrieved from the disk. -3. Running an aggregation for a limited number of random keys, instead of for all keys. Under certain conditions for key distribution in the data, this provides a reasonably accurate result while using fewer resources. +1. Aggregate functions for approximated calculation of the number of distinct values, medians, and quantiles. +2. Running a query based on a part ([SAMPLE](../sql-reference/statements/select/sample.md)) of data and getting an approximated result. In this case, proportionally less data is retrieved from the disk. +3. Running an aggregation for a limited number of random keys, instead of for all keys. Under certain conditions for key distribution in the data, this provides a reasonably accurate result while using fewer resources. ## Adaptive join algorithm {#adaptive-join-algorithm} @@ -93,6 +93,6 @@ ClickHouse implements user account management using SQL queries and allows for [ ## Features that can be considered disadvantages {#clickhouse-features-that-can-be-considered-disadvantages} -1. No full-fledged transactions. -2. Lack of ability to modify or delete already inserted data with a high rate and low latency. There are batch deletes and updates available to clean up or modify data, for example, to comply with [GDPR](https://gdpr-info.eu). -3. The sparse index makes ClickHouse not so efficient for point queries retrieving single rows by their keys. +1. No full-fledged transactions. +2. Lack of ability to modify or delete already inserted data with a high rate and low latency. There are batch deletes and updates available to clean up or modify data, for example, to comply with [GDPR](https://gdpr-info.eu). +3. The sparse index makes ClickHouse not so efficient for point queries retrieving single rows by their keys. diff --git a/docs/about-us/history.md b/docs/about-us/history.md index 9a888930af6..cc4794734ce 100644 --- a/docs/about-us/history.md +++ b/docs/about-us/history.md @@ -28,7 +28,7 @@ ClickHouse also plays a key role in the following processes: - Running queries for debugging the Yandex.Metrica engine. - Analyzing logs from the API and the user interface. -Nowadays, there are a multiple dozen ClickHouse installations in other Yandex services and departments: search verticals, e-commerce, advertisement, business analytics, mobile development, personal services, and others. + Nowadays, there are a multiple dozen ClickHouse installations in other Yandex services and departments: search verticals, e-commerce, advertisement, business analytics, mobile development, personal services, and others. ## Aggregated and non-aggregated data {#aggregated-and-non-aggregated-data} @@ -45,13 +45,12 @@ However data aggregation comes with a lot of limitations: - Users do not view all the reports we generate for them. A large portion of those calculations are useless. - The logical integrity of the data may be violated for various aggregations. -If we do not aggregate anything and work with non-aggregated data, this might reduce the volume of calculations. + If we do not aggregate anything and work with non-aggregated data, this might reduce the volume of calculations. -However, with aggregation, a significant part of the work is taken offline and completed relatively calmly. 
In contrast, online calculations require calculating as fast as possible, since the user is waiting for the result. + However, with aggregation, a significant part of the work is taken offline and completed relatively calmly. In contrast, online calculations require calculating as fast as possible, since the user is waiting for the result. -Yandex.Metrica has a specialized system for aggregating data called Metrage, which was used for the majority of reports. -Starting in 2009, Yandex.Metrica also used a specialized OLAP database for non-aggregated data called OLAPServer, which was previously used for the report builder. -OLAPServer worked well for non-aggregated data, but it had many restrictions that did not allow it to be used for all reports as desired. These included a lack of support for data types (numbers only), and the inability to incrementally update data in real-time (it could only be done by rewriting data daily). OLAPServer is not a DBMS, but a specialized DB. - -The initial goal for ClickHouse was to remove the limitations of OLAPServer and solve the problem of working with non-aggregated data for all reports, but over the years, it has grown into a general-purpose database management system suitable for a wide range of analytical tasks. + Yandex.Metrica has a specialized system for aggregating data called Metrage, which was used for the majority of reports. + Starting in 2009, Yandex.Metrica also used a specialized OLAP database for non-aggregated data called OLAPServer, which was previously used for the report builder. + OLAPServer worked well for non-aggregated data, but it had many restrictions that did not allow it to be used for all reports as desired. These included a lack of support for data types (numbers only), and the inability to incrementally update data in real-time (it could only be done by rewriting data daily). OLAPServer is not a DBMS, but a specialized DB. + The initial goal for ClickHouse was to remove the limitations of OLAPServer and solve the problem of working with non-aggregated data for all reports, but over the years, it has grown into a general-purpose database management system suitable for a wide range of analytical tasks. diff --git a/docs/about-us/support.md b/docs/about-us/support.md index fb02f17832c..0292d0ab890 100644 --- a/docs/about-us/support.md +++ b/docs/about-us/support.md @@ -10,7 +10,7 @@ description: 'Information on ClickHouse Cloud support services' ClickHouse provides Support Services for our ClickHouse Cloud users and customers. Our objective is a Support Services team that represents the ClickHouse product – unparalleled performance, ease of use, and exceptionally fast, high-quality results. For details, [visit our ClickHouse Support Program](https://clickhouse.com/support/program/) page. -[Login to the Cloud console](https://console.clickhouse.cloud/support) and select **Help -> Support** from the menu options to open a new support case and view the status of your submitted cases. +[Login to the Cloud console](https://console.clickhouse.cloud/support) and select **Help -> Support** from the menu options to open a new support case and view the status of your submitted cases. You can also subscribe to our [status page](https://status.clickhouse.com) to get notified quickly about any incidents affecting our platform. 
@@ -19,4 +19,4 @@ Please note that only Subscription Customers have a Service Level Agreement on S - [ClickHouse Community Slack Channel](https://clickhouse.com/slack) - [Other Community Options](https://github.com/ClickHouse/ClickHouse/blob/master/README.md#useful-links) -::: + ::: diff --git a/docs/architecture/cluster-deployment.md b/docs/architecture/cluster-deployment.md index 4b7a2f81068..423d3e9b23c 100644 --- a/docs/architecture/cluster-deployment.md +++ b/docs/architecture/cluster-deployment.md @@ -14,17 +14,17 @@ By going through this tutorial, you'll learn how to set up a simple ClickHouse c This ClickHouse cluster will be a homogeneous cluster. Here are the steps: -1. Install ClickHouse server on all machines of the cluster -2. Set up cluster configs in configuration files -3. Create local tables on each instance -4. Create a [Distributed table](../engines/table-engines/special/distributed.md) +1. Install ClickHouse server on all machines of the cluster +2. Set up cluster configs in configuration files +3. Create local tables on each instance +4. Create a [Distributed table](../engines/table-engines/special/distributed.md) -A [distributed table](../engines/table-engines/special/distributed.md) is a kind of "view" to the local tables in a ClickHouse cluster. A SELECT query from a distributed table executes using resources of all cluster's shards. You may specify configs for multiple clusters and create multiple distributed tables to provide views for different clusters. + A [distributed table](../engines/table-engines/special/distributed.md) is a kind of "view" to the local tables in a ClickHouse cluster. A SELECT query from a distributed table executes using resources of all cluster's shards. You may specify configs for multiple clusters and create multiple distributed tables to provide views for different clusters. -Here is an example config for a cluster with three shards, with one replica each: + Here is an example config for a cluster with three shards, with one replica each: -```xml - + ```xml + @@ -45,40 +45,40 @@ Here is an example config for a cluster with three shards, with one replica each - -``` + + ``` -For further demonstration, let's create a new local table with the same `CREATE TABLE` query that we used for `hits_v1` in the single node deployment tutorial, but with a different table name: + For further demonstration, let's create a new local table with the same `CREATE TABLE` query that we used for `hits_v1` in the single node deployment tutorial, but with a different table name: -```sql -CREATE TABLE tutorial.hits_local (...) ENGINE = MergeTree() ... -``` + ```sql + CREATE TABLE tutorial.hits_local (...) ENGINE = MergeTree() ... + ``` -Creating a distributed table provides a view into the local tables of the cluster: + Creating a distributed table provides a view into the local tables of the cluster: -```sql -CREATE TABLE tutorial.hits_all AS tutorial.hits_local -ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand()); -``` + ```sql + CREATE TABLE tutorial.hits_all AS tutorial.hits_local + ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand()); + ``` -A common practice is to create similar distributed tables on all machines of the cluster. This allows running distributed queries on any machine of the cluster. There's also an alternative option to create a temporary distributed table for a given SELECT query using [remote](../sql-reference/table-functions/remote.md) table function. 
+ A common practice is to create similar distributed tables on all machines of the cluster. This allows running distributed queries on any machine of the cluster. There's also an alternative option to create a temporary distributed table for a given SELECT query using [remote](../sql-reference/table-functions/remote.md) table function. -Let's run [INSERT SELECT](../sql-reference/statements/insert-into.md) into the distributed table to spread the table to multiple servers. + Let's run [INSERT SELECT](../sql-reference/statements/insert-into.md) into the distributed table to spread the table to multiple servers. -```sql -INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1; -``` + ```sql + INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1; + ``` -As you would expect, computationally heavy queries run N times faster if they utilize 3 servers instead of one. + As you would expect, computationally heavy queries run N times faster if they utilize 3 servers instead of one. -In this case, we use a cluster with 3 shards, and each shard contains a single replica. + In this case, we use a cluster with 3 shards, and each shard contains a single replica. -To provide resilience in a production environment, we recommend that each shard contain 2-3 replicas spread between multiple availability zones or datacenters (or at least racks). Note that ClickHouse supports an unlimited number of replicas. + To provide resilience in a production environment, we recommend that each shard contain 2-3 replicas spread between multiple availability zones or datacenters (or at least racks). Note that ClickHouse supports an unlimited number of replicas. -Here is an example config for a cluster of one shard containing three replicas: + Here is an example config for a cluster of one shard containing three replicas: -```xml - + ```xml + ... @@ -96,19 +96,19 @@ Here is an example config for a cluster of one shard containing three replicas: - -``` + + ``` -To enable native replication [ZooKeeper](http://zookeeper.apache.org/), is required. ClickHouse takes care of data consistency on all replicas and runs a restore procedure after a failure automatically. It's recommended to deploy the ZooKeeper cluster on separate servers (where no other processes including ClickHouse are running). + To enable native replication [ZooKeeper](http://zookeeper.apache.org/), is required. ClickHouse takes care of data consistency on all replicas and runs a restore procedure after a failure automatically. It's recommended to deploy the ZooKeeper cluster on separate servers (where no other processes including ClickHouse are running). -:::note Note -ZooKeeper is not a strict requirement: in some simple cases, you can duplicate the data by writing it into all the replicas from your application code. This approach is **not** recommended, as in this case, ClickHouse won't be able to guarantee data consistency on all replicas. Thus, it becomes the responsibility of your application. -::: + :::note Note + ZooKeeper is not a strict requirement: in some simple cases, you can duplicate the data by writing it into all the replicas from your application code. This approach is **not** recommended, as in this case, ClickHouse won't be able to guarantee data consistency on all replicas. Thus, it becomes the responsibility of your application. 
+ ::: -ZooKeeper locations are specified in the configuration file: + ZooKeeper locations are specified in the configuration file: -```xml - + ```xml + zoo01.clickhouse.com 2181 @@ -121,33 +121,33 @@ ZooKeeper locations are specified in the configuration file: zoo03.clickhouse.com 2181 - -``` + + ``` -Also, we need to set macros for identifying each shard and replica which are used on table creation: + Also, we need to set macros for identifying each shard and replica which are used on table creation: -```xml - + ```xml + 01 01 - -``` + + ``` -If there are no replicas at the moment of replicated table creation, a new first replica is instantiated. If there are already live replicas, the new replica clones data from existing ones. You have an option to create all replicated tables first, and then insert data to it. Another option is to create some replicas and add the others after or during data insertion. + If there are no replicas at the moment of replicated table creation, a new first replica is instantiated. If there are already live replicas, the new replica clones data from existing ones. You have an option to create all replicated tables first, and then insert data to it. Another option is to create some replicas and add the others after or during data insertion. -```sql -CREATE TABLE tutorial.hits_replica (...) -ENGINE = ReplicatedMergeTree( + ```sql + CREATE TABLE tutorial.hits_replica (...) + ENGINE = ReplicatedMergeTree( '/clickhouse_perftest/tables/{shard}/hits', '{replica}' -) -... -``` + ) + ... + ``` -Here we use the [ReplicatedMergeTree](../engines/table-engines/mergetree-family/replication.md) table engine. In parameters, we specify the ZooKeeper path containing the shard and replica identifiers. + Here we use the [ReplicatedMergeTree](../engines/table-engines/mergetree-family/replication.md) table engine. In parameters, we specify the ZooKeeper path containing the shard and replica identifiers. -```sql -INSERT INTO tutorial.hits_replica SELECT * FROM tutorial.hits_local; -``` + ```sql + INSERT INTO tutorial.hits_replica SELECT * FROM tutorial.hits_local; + ``` -Replication operates in multi-master mode. Data can be loaded into any replica, and the system then syncs it with other instances automatically. Replication is asynchronous so at a given moment, not all replicas may contain recently inserted data. At least one replica should be up to allow for data ingestion. Others will sync up data and repair consistency once they become active again. Note that this approach allows for the low possibility of loss of recently inserted data. + Replication operates in multi-master mode. Data can be loaded into any replica, and the system then syncs it with other instances automatically. Replication is asynchronous so at a given moment, not all replicas may contain recently inserted data. At least one replica should be up to allow for data ingestion. Others will sync up data and repair consistency once they become active again. Note that this approach allows for the low possibility of loss of recently inserted data. 
diff --git a/docs/best-practices/_snippets/_async_inserts.md b/docs/best-practices/_snippets/_async_inserts.md index 186e699c890..62dd53339fc 100644 --- a/docs/best-practices/_snippets/_async_inserts.md +++ b/docs/best-practices/_snippets/_async_inserts.md @@ -13,25 +13,25 @@ The core behavior is controlled via the [`async_insert`](/operations/settings/se Async inserts -When enabled (1), inserts are buffered and only written to disk once one of the flush conditions is met: +When enabled (1), inserts are buffered and only written to disk once one of the flush conditions is met: (1) the buffer reaches a specified size (async_insert_max_data_size) -(2) a time threshold elapses (async_insert_busy_timeout_ms) or -(3) a maximum number of insert queries accumulate (async_insert_max_query_number). +(2) a time threshold elapses (async_insert_busy_timeout_ms) or +(3) a maximum number of insert queries accumulate (async_insert_max_query_number). This batching process is invisible to clients and helps ClickHouse efficiently merge insert traffic from multiple sources. However, until a flush occurs, the data cannot be queried. Importantly, there are multiple buffers per insert shape and settings combination, and in clusters, buffers are maintained per node - enabling fine-grained control across multi-tenant environments. Insert mechanics are otherwise identical to those described for [synchronous inserts](/best-practices/selecting-an-insert-strategy#synchronous-inserts-by-default). ### Choosing a return mode {#choosing-a-return-mode} -The behavior of asynchronous inserts is further refined using the [`wait_for_async_insert`](/operations/settings/settings#wait_for_async_insert) setting. +The behavior of asynchronous inserts is further refined using the [`wait_for_async_insert`](/operations/settings/settings#wait_for_async_insert) setting. -When set to 1 (the default), ClickHouse only acknowledges the insert after the data is successfully flushed to disk. This ensures strong durability guarantees and makes error handling straightforward: if something goes wrong during the flush, the error is returned to the client. This mode is recommended for most production scenarios, especially when insert failures must be tracked reliably. +When set to 1 (the default), ClickHouse only acknowledges the insert after the data is successfully flushed to disk. This ensures strong durability guarantees and makes error handling straightforward: if something goes wrong during the flush, the error is returned to the client. This mode is recommended for most production scenarios, especially when insert failures must be tracked reliably. [Benchmarks](https://clickhouse.com/blog/asynchronous-data-inserts-in-clickhouse) show it scales well with concurrency - whether you're running 200 or 500 clients- thanks to adaptive inserts and stable part creation behavior. -Setting `wait_for_async_insert = 0` enables "fire-and-forget" mode. Here, the server acknowledges the insert as soon as the data is buffered, without waiting for it to reach storage. +Setting `wait_for_async_insert = 0` enables "fire-and-forget" mode. Here, the server acknowledges the insert as soon as the data is buffered, without waiting for it to reach storage. -This offers ultra-low-latency inserts and maximal throughput, ideal for high-velocity, low-criticality data. However, this comes with trade-offs: there's no guarantee the data will be persisted, errors may only surface during flush, and it's difficult to trace failed inserts. 
Use this mode only if your workload can tolerate data loss. +This offers ultra-low-latency inserts and maximal throughput, ideal for high-velocity, low-criticality data. However, this comes with trade-offs: there's no guarantee the data will be persisted, errors may only surface during flush, and it's difficult to trace failed inserts. Use this mode only if your workload can tolerate data loss. [Benchmarks also demonstrate](https://clickhouse.com/blog/asynchronous-data-inserts-in-clickhouse) substantial part reduction and lower CPU usage when buffer flushes are infrequent (e.g. every 30 seconds), but the risk of silent failure remains. @@ -39,7 +39,7 @@ Our strong recommendation is to use `async_insert=1,wait_for_async_insert=1` if ### Deduplication and reliability {#deduplication-and-reliability} -By default, ClickHouse performs automatic deduplication for synchronous inserts, which makes retries safe in failure scenarios. However, this is disabled for asynchronous inserts unless explicitly enabled (this should not be enabled if you have dependent materialized views - [see issue](https://github.com/ClickHouse/ClickHouse/issues/66003)). +By default, ClickHouse performs automatic deduplication for synchronous inserts, which makes retries safe in failure scenarios. However, this is disabled for asynchronous inserts unless explicitly enabled (this should not be enabled if you have dependent materialized views - [see issue](https://github.com/ClickHouse/ClickHouse/issues/66003)). In practice, if deduplication is turned on and the same insert is retried - due to, for instance, a timeout or network drop - ClickHouse can safely ignore the duplicate. This helps maintain idempotency and avoids double-writing data. Still, it's worth noting that insert validation and schema parsing happen only during buffer flush - so errors (like type mismatches) will only surface at that point. @@ -48,17 +48,16 @@ In practice, if deduplication is turned on and the same insert is retried - due Asynchronous inserts can be enabled for a particular user, or for a specific query: - Enabling asynchronous inserts at the user level. This example uses the user `default`, if you create a different user then substitute that username: - ```sql - ALTER USER default SETTINGS async_insert = 1 - ``` + ```sql + ALTER USER default SETTINGS async_insert = 1 + ``` - You can specify the asynchronous insert settings by using the SETTINGS clause of insert queries: - ```sql - INSERT INTO YourTable SETTINGS async_insert=1, wait_for_async_insert=1 VALUES (...) - ``` + ```sql + INSERT INTO YourTable SETTINGS async_insert=1, wait_for_async_insert=1 VALUES (...) + ``` - You can also specify asynchronous insert settings as connection parameters when using a ClickHouse programming language client. 
- As an example, this is how you can do that within a JDBC connection string when you use the ClickHouse Java JDBC driver for connecting to ClickHouse Cloud : - ```bash - "jdbc:ch://HOST.clickhouse.cloud:8443/?user=default&password=PASSWORD&ssl=true&custom_http_params=async_insert=1,wait_for_async_insert=1" - ``` - + As an example, this is how you can do that within a JDBC connection string when you use the ClickHouse Java JDBC driver for connecting to ClickHouse Cloud : + ```bash + "jdbc:ch://HOST.clickhouse.cloud:8443/?user=default&password=PASSWORD&ssl=true&custom_http_params=async_insert=1,wait_for_async_insert=1" + ``` diff --git a/docs/best-practices/_snippets/_avoid_mutations.md b/docs/best-practices/_snippets/_avoid_mutations.md index 9bfd2ce741c..f6507dc33eb 100644 --- a/docs/best-practices/_snippets/_avoid_mutations.md +++ b/docs/best-practices/_snippets/_avoid_mutations.md @@ -1,8 +1,8 @@ -In ClickHouse, **mutations** refer to operations that modify or delete existing data in a table - typically using `ALTER TABLE ... DELETE` or `ALTER TABLE ... UPDATE`. While these statements may appear similar to standard SQL operations, they are fundamentally different under the hood. +In ClickHouse, **mutations** refer to operations that modify or delete existing data in a table - typically using `ALTER TABLE ... DELETE` or `ALTER TABLE ... UPDATE`. While these statements may appear similar to standard SQL operations, they are fundamentally different under the hood. Rather than modifying rows in place, mutations in ClickHouse are asynchronous background processes that rewrite entire [data parts](/parts) affected by the change. This approach is necessary due to ClickHouse's column-oriented, immutable storage model, but it can lead to significant I/O and resource usage. -When a mutation is issued, ClickHouse schedules the creation of new **mutated parts**, leaving the original parts untouched until the new ones are ready. Once ready, the mutated parts atomically replace the originals. However, because the operation rewrites entire parts, even a minor change (such as updating a single row) may result in large-scale rewrites and excessive write amplification. +When a mutation is issued, ClickHouse schedules the creation of new **mutated parts**, leaving the original parts untouched until the new ones are ready. Once ready, the mutated parts atomically replace the originals. However, because the operation rewrites entire parts, even a minor change (such as updating a single row) may result in large-scale rewrites and excessive write amplification. For large datasets, this can produce a substantial spike in disk I/O and degrade overall cluster performance. Unlike merges, mutations can't be rolled back once submitted and will continue to execute even after server restarts unless explicitly cancelled - see [`KILL MUTATION`](/sql-reference/statements/kill#kill-mutation). diff --git a/docs/best-practices/_snippets/_avoid_optimize_final.md b/docs/best-practices/_snippets/_avoid_optimize_final.md index 9cc119a9bd9..ff4cbe759b6 100644 --- a/docs/best-practices/_snippets/_avoid_optimize_final.md +++ b/docs/best-practices/_snippets/_avoid_optimize_final.md @@ -1,8 +1,7 @@ import Image from '@theme/IdealImage'; import simple_merges from '@site/static/images/bestpractices/simple_merges.png'; - -ClickHouse tables using the **MergeTree engine** store data on disk as **immutable parts**, which are created every time data is inserted. 
+ClickHouse tables using the **MergeTree engine** store data on disk as **immutable parts**, which are created every time data is inserted. Each insert creates a new part containing sorted, compressed column files, along with metadata like indexes and checksums. For a detailed description of part structures and how they are formed we recommend this [guide](/parts). @@ -29,7 +28,7 @@ Running `OPTIMIZE FINAL` forces ClickHouse to merge **all** active parts into a 3. **Compressing** it again 4. **Writing** the final part to disk or object storage -These steps are **CPU and I/O-intensive** and can put significant strain on your system, especially when large datasets are involved. + These steps are **CPU and I/O-intensive** and can put significant strain on your system, especially when large datasets are involved. ### It ignores safety limits {#it-ignores-safety-limits} diff --git a/docs/best-practices/_snippets/_bulk_inserts.md b/docs/best-practices/_snippets/_bulk_inserts.md index 313f4113463..844ec42f0ce 100644 --- a/docs/best-practices/_snippets/_bulk_inserts.md +++ b/docs/best-practices/_snippets/_bulk_inserts.md @@ -1,11 +1,11 @@ The above mechanics illustrate a constant overhead regardless of the insert size, making batch size the single most important optimization for ingest throughput. Batching inserts reduce the overhead as a proportion of total insert time and improves processing efficiency. -We recommend inserting data in batches of at least 1,000 rows, and ideally between 10,000–100,000 rows. Fewer, larger inserts reduce the number of parts written, minimize merge load, and lower overall system resource usage. +We recommend inserting data in batches of at least 1,000 rows, and ideally between 10,000–100,000 rows. Fewer, larger inserts reduce the number of parts written, minimize merge load, and lower overall system resource usage. **For a synchronous insert strategy to be effective this client-side batching is required.** If you're unable to batch data client-side, ClickHouse supports asynchronous inserts that shift batching to the server ([see](/best-practices/selecting-an-insert-strategy#asynchronous-inserts)). -:::tip -Regardless of the size of your inserts, we recommend keeping the number of insert queries around one insert query per second. The reason for that recommendation is that the created parts are merged to larger parts in the background (in order to optimize your data for read queries), and sending too many insert queries per second can lead to situations where the background merging can't keep up with the number of new parts. However, you can use a higher rate of insert queries per second when you use asynchronous inserts (see asynchronous inserts). +:::tip +Regardless of the size of your inserts, we recommend keeping the number of insert queries around one insert query per second. The reason for that recommendation is that the created parts are merged to larger parts in the background (in order to optimize your data for read queries), and sending too many insert queries per second can lead to situations where the background merging can't keep up with the number of new parts. However, you can use a higher rate of insert queries per second when you use asynchronous inserts (see asynchronous inserts). 
::: diff --git a/docs/best-practices/choosing_a_primary_key.md b/docs/best-practices/choosing_a_primary_key.md index 822456aa849..8c85b20d7b3 100644 --- a/docs/best-practices/choosing_a_primary_key.md +++ b/docs/best-practices/choosing_a_primary_key.md @@ -12,22 +12,19 @@ import Image from '@theme/IdealImage'; import create_primary_key from '@site/static/images/bestpractices/create_primary_key.gif'; import primary_key from '@site/static/images/bestpractices/primary_key.gif'; - > We interchangeably use the term "ordering key" to refer to the "primary key" on this page. Strictly, [these differ in ClickHouse](/engines/table-engines/mergetree-family/mergetree#choosing-a-primary-key-that-differs-from-the-sorting-key), but for the purposes of this document, readers can use them interchangeably, with the ordering key referring to the columns specified in the table `ORDER BY`. Note that a ClickHouse primary key works [very differently](/migrations/postgresql/data-modeling-techniques#primary-ordering-keys-in-clickhouse) to those familiar with similar terms in OLTP databases such as Postgres. Choosing an effective primary key in ClickHouse is crucial for query performance and storage efficiency. ClickHouse organizes data into parts, each containing its own sparse primary index. This index significantly speeds up queries by reducing the volume of data scanned. Additionally, because the primary key determines the physical order of data on disk, it directly impacts compression efficiency. Optimally ordered data compresses more effectively, which further enhances performance by reducing I/O. - 1. When selecting an ordering key, prioritize columns frequently used in query filters (i.e. the `WHERE` clause), especially those that exclude large numbers of rows. 2. Columns highly correlated with other data in the table are also beneficial, as contiguous storage improves compression ratios and memory efficiency during `GROUP BY` and `ORDER BY` operations. -
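To make the two points above concrete, here is a hypothetical sketch (the table and column names are illustrative, not taken from this guide): a web-events table that is mostly filtered by event type, where URLs correlate strongly with that type, could be ordered as follows:

```sql
-- Hypothetical schema: `event_type` is low-cardinality and appears in most WHERE clauses (point 1),
-- while `url` is highly correlated with it, improving compression and GROUP BY locality (point 2).
CREATE TABLE web_events
(
    `timestamp` DateTime,
    `event_type` LowCardinality(String),
    `url` String,
    `response_time_ms` UInt32
)
ENGINE = MergeTree
ORDER BY (event_type, url, timestamp);
```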
-Some simple rules can be applied to help choose an ordering key. The following can sometimes be in conflict, so consider these in order. **Users can identify a number of keys from this process, with 4-5 typically sufficient**: + Some simple rules can be applied to help choose an ordering key. The following can sometimes be in conflict, so consider these in order. **Users can identify a number of keys from this process, with 4-5 typically sufficient**: -:::note Important -Ordering keys must be defined on table creation and cannot be added. Additional ordering can be added to a table after (or before) data insertion through a feature known as projections. Be aware these result in data duplication. Further details [here](/sql-reference/statements/alter/projection). -::: + :::note Important + Ordering keys must be defined on table creation and cannot be added. Additional ordering can be added to a table after (or before) data insertion through a feature known as projections. Be aware these result in data duplication. Further details [here](/sql-reference/statements/alter/projection). + ::: ## Example {#example} @@ -39,7 +36,7 @@ This table has no primary key - as indicated by `ORDER BY tuple()`. CREATE TABLE posts_unordered ( `Id` Int32, - `PostTypeId` Enum('Question' = 1, 'Answer' = 2, 'Wiki' = 3, 'TagWikiExcerpt' = 4, + `PostTypeId` Enum('Question' = 1, 'Answer' = 2, 'Wiki' = 3, 'TagWikiExcerpt' = 4, 'TagWiki' = 5, 'ModeratorNomination' = 6, 'WikiPlaceholder' = 7, 'PrivilegeWiki' = 8), `AcceptedAnswerId` UInt32, `CreationDate` DateTime, @@ -107,7 +104,7 @@ Assume a table `posts_ordered`, containing the same data, is defined with an `OR CREATE TABLE posts_ordered ( `Id` Int32, - `PostTypeId` Enum('Question' = 1, 'Answer' = 2, 'Wiki' = 3, 'TagWikiExcerpt' = 4, 'TagWiki' = 5, 'ModeratorNomination' = 6, + `PostTypeId` Enum('Question' = 1, 'Answer' = 2, 'Wiki' = 3, 'TagWikiExcerpt' = 4, 'TagWiki' = 5, 'ModeratorNomination' = 6, 'WikiPlaceholder' = 7, 'PrivilegeWiki' = 8), ... ) @@ -115,7 +112,7 @@ ENGINE = MergeTree ORDER BY (PostTypeId, toDate(CreationDate)) ``` -`PostTypeId` has a cardinality of 8 and represents the logical choice for the first entry in our ordering key. Recognizing date granularity filtering is likely to be sufficient (it will still benefit datetime filters) so we use `toDate(CreationDate)` as the 2nd component of our key. This will also produce a smaller index as a date can be represented by 16 bits, speeding up filtering. +`PostTypeId` has a cardinality of 8 and represents the logical choice for the first entry in our ordering key. Recognizing date granularity filtering is likely to be sufficient (it will still benefit datetime filters) so we use `toDate(CreationDate)` as the 2nd component of our key. This will also produce a smaller index as a date can be represented by 16 bits, speeding up filtering. The following animation shows how an optimized sparse primary index is created for the Stack Overflow posts table. Instead of indexing individual rows, the index targets blocks of rows: @@ -135,7 +132,7 @@ WHERE (CreationDate >= '2024-01-01') AND (PostTypeId = 'Question') 1 row in set. Elapsed: 0.013 sec. Processed 196.53 thousand rows, 1.77 MB (14.64 million rows/s., 131.78 MB/s.) ``` -This query now leverages sparse indexing, significantly reducing the amount of data read and speeding up the execution time by 4x - note the reduction of rows and bytes read. 
+This query now leverages sparse indexing, significantly reducing the amount of data read and speeding up the execution time by 4x - note the reduction of rows and bytes read. The use of the index can be confirmed with an `EXPLAIN indexes=1`. diff --git a/docs/best-practices/json_type.md b/docs/best-practices/json_type.md index 4accb95351d..cef254571d3 100644 --- a/docs/best-practices/json_type.md +++ b/docs/best-practices/json_type.md @@ -18,25 +18,25 @@ Use the JSON type when your data: * Contains **values with varying types** (e.g., a path might sometimes contain a string, sometimes a number). * Requires schema flexibility where strict typing isn't viable. -If your data structure is known and consistent, there is rarely a need for the JSON type, even if your data is in JSON format. Specifically, if your data has: + If your data structure is known and consistent, there is rarely a need for the JSON type, even if your data is in JSON format. Specifically, if your data has: * **A flat structure with known keys**: use standard column types e.g. String. * **Predictable nesting**: use Tuple, Array, or Nested types for these structures. * **Predictable structure with varying types**: consider Dynamic or Variant types instead. -You can also mix approaches - for example, use static columns for predictable top-level fields and a single JSON column for a dynamic section of the payload. + You can also mix approaches - for example, use static columns for predictable top-level fields and a single JSON column for a dynamic section of the payload. ## Considerations and tips for using JSON {#considerations-and-tips-for-using-json} The JSON type enables efficient columnar storage by flattening paths into subcolumns. But with flexibility comes responsibility. To use it effectively: -* **Specify path types** using [hints in the column definition](/sql-reference/data-types/newjson) to specify types for known sub columns, avoiding unnecessary type inference. +* **Specify path types** using [hints in the column definition](/sql-reference/data-types/newjson) to specify types for known sub columns, avoiding unnecessary type inference. * **Skip paths** if you don't need the values, with [SKIP and SKIP REGEXP](/sql-reference/data-types/newjson) to reduce storage and improve performance. * **Avoid setting [`max_dynamic_paths`](/sql-reference/data-types/newjson#reaching-the-limit-of-dynamic-paths-inside-json) too high** - large values increase resource consumption and reduce efficiency. As a rule of thumb, keep it below 10,000. -:::note Type hints -Type hits offer more than just a way to avoid unnecessary type inference - they eliminate storage and processing indirection entirely. JSON paths with type hints are always stored just like traditional columns, bypassing the need for [**discriminator columns**](https://clickhouse.com/blog/a-new-powerful-json-data-type-for-clickhouse#storage-extension-for-dynamically-changing-data) or dynamic resolution during query time. This means that with well-defined type hints, nested JSON fields achieve the same performance and efficiency as if they were modeled as top-level fields from the outset. As a result, for datasets that are mostly consistent but still benefit from the flexibility of JSON, type hints provide a convenient way to preserve performance without needing to restructure your schema or ingest pipeline. -::: + :::note Type hints + Type hits offer more than just a way to avoid unnecessary type inference - they eliminate storage and processing indirection entirely. 
JSON paths with type hints are always stored just like traditional columns, bypassing the need for [**discriminator columns**](https://clickhouse.com/blog/a-new-powerful-json-data-type-for-clickhouse#storage-extension-for-dynamically-changing-data) or dynamic resolution during query time. This means that with well-defined type hints, nested JSON fields achieve the same performance and efficiency as if they were modeled as top-level fields from the outset. As a result, for datasets that are mostly consistent but still benefit from the flexibility of JSON, type hints provide a convenient way to preserve performance without needing to restructure your schema or ingest pipeline. + ::: ## Advanced features {#advanced-features} @@ -45,7 +45,7 @@ Type hits offer more than just a way to avoid unnecessary type inference - they * You can read nested sub-objects using the `.^` syntax. * Query syntax may differ from standard SQL and may require special casting or operators for nested fields. -For additional guidance, see[ ClickHouse JSON documentation](/sql-reference/data-types/newjson) or explore our blog post[ A New Powerful JSON Data Type for ClickHouse](https://clickhouse.com/blog/a-new-powerful-json-data-type-for-clickhouse). + For additional guidance, see[ ClickHouse JSON documentation](/sql-reference/data-types/newjson) or explore our blog post[ A New Powerful JSON Data Type for ClickHouse](https://clickhouse.com/blog/a-new-powerful-json-data-type-for-clickhouse). ## Examples {#examples} @@ -151,7 +151,7 @@ ORDER BY update_date Again we can insert the data as JSON: ```sql -INSERT INTO arxiv FORMAT JSONEachRow +INSERT INTO arxiv FORMAT JSONEachRow {"id":"2101.11408","submitter":"Daniel Lemire","authors":"Daniel Lemire","title":"Number Parsing at a Gigabyte per Second","comments":"Software at https://github.com/fastfloat/fast_float and\n https://github.com/lemire/simple_fastfloat_benchmark/","journal-ref":"Software: Practice and Experience 51 (8), 2021","doi":"10.1002/spe.2984","report-no":null,"categories":"cs.DS cs.MS","license":"http://creativecommons.org/licenses/by/4.0/","abstract":"With disks and networks providing gigabytes per second ....\n","versions":[{"created":"Mon, 11 Jan 2021 20:31:27 GMT","version":"v1"},{"created":"Sat, 30 Jan 2021 23:57:29 GMT","version":"v2"}],"update_date":"2022-11-07","authors_parsed":[["Lemire","Daniel",""]]} ``` @@ -211,7 +211,6 @@ Suppose another column called `tags` is added. If this was simply a list of stri In this case, we could model the arXiv documents as either all JSON or simply add a JSON `tags` column. 
We provide both examples below: - ```sql CREATE TABLE arxiv ( @@ -228,7 +227,7 @@ We provide a type hint for the `update_date` column in the JSON definition, as w We can insert into this table and view the subsequently inferred schema using the [`JSONAllPathsWithTypes`](/sql-reference/functions/json-functions#jsonallpathswithtypes) function and [`PrettyJSONEachRow`](/interfaces/formats/PrettyJSONEachRow) output format: ```sql -INSERT INTO arxiv FORMAT JSONAsObject +INSERT INTO arxiv FORMAT JSONAsObject {"id":"2101.11408","submitter":"Daniel Lemire","authors":"Daniel Lemire","title":"Number Parsing at a Gigabyte per Second","comments":"Software at https://github.com/fastfloat/fast_float and\n https://github.com/lemire/simple_fastfloat_benchmark/","journal-ref":"Software: Practice and Experience 51 (8), 2021","doi":"10.1002/spe.2984","report-no":null,"categories":"cs.DS cs.MS","license":"http://creativecommons.org/licenses/by/4.0/","abstract":"With disks and networks providing gigabytes per second ....\n","versions":[{"created":"Mon, 11 Jan 2021 20:31:27 GMT","version":"v1"},{"created":"Sat, 30 Jan 2021 23:57:29 GMT","version":"v2"}],"update_date":"2022-11-07","authors_parsed":[["Lemire","Daniel",""]],"tags":{"tag_1":{"name":"ClickHouse user","score":"A+","comment":"A good read, applicable to ClickHouse"},"28_03_2025":{"name":"professor X","score":10,"comment":"Didn't learn much","updates":[{"name":"professor X","comment":"Wolverine found more interesting"}]}}} ``` @@ -291,7 +290,7 @@ ORDER BY update_date ``` ```sql -INSERT INTO arxiv FORMAT JSONEachRow +INSERT INTO arxiv FORMAT JSONEachRow {"id":"2101.11408","submitter":"Daniel Lemire","authors":"Daniel Lemire","title":"Number Parsing at a Gigabyte per Second","comments":"Software at https://github.com/fastfloat/fast_float and\n https://github.com/lemire/simple_fastfloat_benchmark/","journal-ref":"Software: Practice and Experience 51 (8), 2021","doi":"10.1002/spe.2984","report-no":null,"categories":"cs.DS cs.MS","license":"http://creativecommons.org/licenses/by/4.0/","abstract":"With disks and networks providing gigabytes per second ....\n","versions":[{"created":"Mon, 11 Jan 2021 20:31:27 GMT","version":"v1"},{"created":"Sat, 30 Jan 2021 23:57:29 GMT","version":"v2"}],"update_date":"2022-11-07","authors_parsed":[["Lemire","Daniel",""]],"tags":{"tag_1":{"name":"ClickHouse user","score":"A+","comment":"A good read, applicable to ClickHouse"},"28_03_2025":{"name":"professor X","score":10,"comment":"Didn't learn much","updates":[{"name":"professor X","comment":"Wolverine found more interesting"}]}}} ``` diff --git a/docs/best-practices/minimize_optimize_joins.md b/docs/best-practices/minimize_optimize_joins.md index 959f8e9e5a1..b90b23bcefe 100644 --- a/docs/best-practices/minimize_optimize_joins.md +++ b/docs/best-practices/minimize_optimize_joins.md @@ -20,9 +20,9 @@ In general, denormalize when: - Only a limited subset of the columns will be queried, i.e. certain columns can be excluded from denormalization. - You have the capability to shift processing out of ClickHouse into upstream systems like Flink, where real-time enrichment or flattening can be managed. -Not all data needs to be denormalized - focus on the attributes that are frequently queried. Also consider [materialized views](/best-practices/use-materialized-views) to incrementally compute aggregates instead of duplicating entire sub-tables. When schema updates are rare and latency is critical, denormalization offers the best performance trade-off. 
+ Not all data needs to be denormalized - focus on the attributes that are frequently queried. Also consider [materialized views](/best-practices/use-materialized-views) to incrementally compute aggregates instead of duplicating entire sub-tables. When schema updates are rare and latency is critical, denormalization offers the best performance trade-off. -For a full guide on denormalizing data in ClickHouse see [here](/data-modeling/denormalization). + For a full guide on denormalizing data in ClickHouse see [here](/data-modeling/denormalization). ## When JOINs are required {#when-joins-are-required} @@ -37,10 +37,9 @@ Follow these best practices to improve JOIN performance: * **Avoid disk-spilling JOINs**: Intermediate states of JOINs (e.g. hash tables) can become so big that they no longer fit into main memory. In this situation, ClickHouse will return an out-of-memory error by default. Some join algorithms (see below), for example [`grace_hash`](https://clickhouse.com/blog/clickhouse-fully-supports-joins-hash-joins-part2), [`partial_merge`](https://clickhouse.com/blog/clickhouse-fully-supports-joins-full-sort-partial-merge-part3) and [`full_sorting_merge`](https://clickhouse.com/blog/clickhouse-fully-supports-joins-full-sort-partial-merge-part3), are able to spill intermediate states to disk and continue query execution. These join algorithms should nevertheless be used with care as disk access can significantly slow down join processing. We instead recommend optimizing the JOIN query in other ways to reduce the size of intermediate states. * **Default values as no-match markers in outer JOINs**: Left/right/full outer joins include all values from the left/right/both tables. If no join partner is found in the other table for some value, ClickHouse replaces the join partner by a special marker. The SQL standard mandates that databases use NULL as such a marker. In ClickHouse, this requires wrapping the result column in Nullable, creating an additional memory and performance overhead. As an alternative, you can configure the setting `join_use_nulls = 0` and use the default value of the result column data type as marker. - -:::note Use dictionaries carefully -When using dictionaries for JOINs in ClickHouse, it's important to understand that dictionaries, by design, do not allow duplicate keys. During data loading, any duplicate keys are silently deduplicated—only the last loaded value for a given key is retained. This behavior makes dictionaries ideal for one-to-one or many-to-one relationships where only the latest or authoritative value is needed. However, using a dictionary for a one-to-many or many-to-many relationship (e.g. joining roles to actors where an actor can have multiple roles) will result in silent data loss, as all but one of the matching rows will be discarded. As a result, dictionaries are not suitable for scenarios requiring full relational fidelity across multiple matches. -::: + :::note Use dictionaries carefully + When using dictionaries for JOINs in ClickHouse, it's important to understand that dictionaries, by design, do not allow duplicate keys. During data loading, any duplicate keys are silently deduplicated—only the last loaded value for a given key is retained. This behavior makes dictionaries ideal for one-to-one or many-to-one relationships where only the latest or authoritative value is needed. However, using a dictionary for a one-to-many or many-to-many relationship (e.g. 
joining roles to actors where an actor can have multiple roles) will result in silent data loss, as all but one of the matching rows will be discarded. As a result, dictionaries are not suitable for scenarios requiring full relational fidelity across multiple matches. + ::: ## Choosing the right JOIN Algorithm {#choosing-the-right-join-algorithm} @@ -52,18 +51,18 @@ ClickHouse supports several JOIN algorithms that trade off between speed and mem * **Partial Merge JOIN:** Minimizes memory but is slower—best for joining large tables with limited memory. * **Grace Hash JOIN:** Flexible and memory-tunable, good for large datasets with adjustable performance characteristics. -Joins - speed vs memory + Joins - speed vs memory -:::note -Each algorithm has varying support for JOIN types. A full list of supported join types for each algorithm can be found [here](/guides/joining-tables#choosing-a-join-algorithm). -::: + :::note + Each algorithm has varying support for JOIN types. A full list of supported join types for each algorithm can be found [here](/guides/joining-tables#choosing-a-join-algorithm). + ::: -You can let ClickHouse choose the best algorithm by setting `join_algorithm = 'auto'` (the default), or explicitly control it based on your workload. If you need to select a join algorithm to optimize for performance or memory overhead, we recommend [this guide](/guides/joining-tables#choosing-a-join-algorithm). + You can let ClickHouse choose the best algorithm by setting `join_algorithm = 'auto'` (the default), or explicitly control it based on your workload. If you need to select a join algorithm to optimize for performance or memory overhead, we recommend [this guide](/guides/joining-tables#choosing-a-join-algorithm). -For optimal performance: + For optimal performance: * Keep JOINs to a minimum in high-performance workloads. * Avoid more than 3–4 joins per query. * Benchmark different algorithms on real data - performance varies based on JOIN key distribution and data size. -For more on JOIN optimization strategies, JOIN algorithms, and how to tune them, refer to the[ ClickHouse documentation](/guides/joining-tables) and this [blog series](https://clickhouse.com/blog/clickhouse-fully-supports-joins-part1). + For more on JOIN optimization strategies, JOIN algorithms, and how to tune them, refer to the[ ClickHouse documentation](/guides/joining-tables) and this [blog series](https://clickhouse.com/blog/clickhouse-fully-supports-joins-part1). diff --git a/docs/best-practices/partitioning_keys.mdx b/docs/best-practices/partitioning_keys.mdx index c91376b4e09..50bbc0b391c 100644 --- a/docs/best-practices/partitioning_keys.mdx +++ b/docs/best-practices/partitioning_keys.mdx @@ -36,7 +36,6 @@ Whenever a set of rows is inserted into the table, instead of creating (at[ leas Partitions - The ClickHouse server first splits the rows from the example insert with 4 rows sketched in the diagram above by their partition key value `toStartOfMonth(date)`. Then, for each identified partition, the rows are processed as[ usual](/parts) by performing several sequential steps (① Sorting, ② Splitting into columns, ③ Compression, ④ Writing to Disk). For a more detailed explanation of partitioning, we recommend [this guide](/partitions). @@ -49,7 +48,7 @@ With partitioning enabled, ClickHouse only [merges](/merges) data parts within, Partitioning is a powerful tool for managing large datasets in ClickHouse, especially in observability and analytics use cases. 
It enables efficient data life cycle operations by allowing entire partitions, often aligned with time or business logic, to be dropped, moved, or archived in a single metadata operation. This is significantly faster and less resource-intensive than row-level delete or copy operations. Partitioning also integrates cleanly with ClickHouse features like TTL and tiered storage, making it possible to implement retention policies or hot/cold storage strategies without custom orchestration. For example, recent data can be kept on fast SSD-backed storage, while older partitions are automatically moved to cheaper object storage. -While partitioning can improve query performance for some workloads, it can also negatively impact response time. +While partitioning can improve query performance for some workloads, it can also negatively impact response time. If the partitioning key is not in the primary key and you are filtering by it, users may see an improvement in query performance with partitioning. See [here](/partitions#query-optimization) for an example. diff --git a/docs/best-practices/select_data_type.md b/docs/best-practices/select_data_type.md index fa55d0a8a45..0033d6f508e 100644 --- a/docs/best-practices/select_data_type.md +++ b/docs/best-practices/select_data_type.md @@ -15,7 +15,6 @@ Compression efficiency in ClickHouse depends mainly on three factors: the orderi Some straightforward guidelines can significantly enhance the schema: - * **Use Strict Types:** Always select the correct data type for columns. Numeric and date fields should use appropriate numeric and date types rather than general-purpose String types. This ensures correct semantics for filtering and aggregations. * **Avoid nullable Columns:** Nullable columns introduce additional overhead by maintaining separate columns for tracking null values. Only use Nullable if explicitly required to distinguish between empty and null states. Otherwise, default or zero-equivalent values typically suffice. For further information on why this type should be avoided unless needed, see [Avoid nullable Columns](/best-practices/select-data-types#avoid-nullable-columns). @@ -30,13 +29,12 @@ Some straightforward guidelines can significantly enhance the schema: ## Example {#example} -ClickHouse offers built-in tools to streamline type optimization. For example, schema inference can automatically identify initial types. Consider the Stack Overflow dataset, publicly available in Parquet format. Running a simple schema inference via the [`DESCRIBE`](/sql-reference/statements/describe-table) command provides an initial non-optimized schema. +ClickHouse offers built-in tools to streamline type optimization. For example, schema inference can automatically identify initial types. Consider the Stack Overflow dataset, publicly available in Parquet format. Running a simple schema inference via the [`DESCRIBE`](/sql-reference/statements/describe-table) command provides an initial non-optimized schema. :::note By default, ClickHouse maps these to equivalent Nullable types. This is preferred as the schema is based on a sample of the rows only. 
::: - ```sql DESCRIBE TABLE s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/*.parquet') SETTINGS describe_compact_output = 1 @@ -109,7 +107,7 @@ This results in the following optimized schema (with respect to types): CREATE TABLE posts ( Id Int32, - PostTypeId Enum('Question' = 1, 'Answer' = 2, 'Wiki' = 3, 'TagWikiExcerpt' = 4, 'TagWiki' = 5, + PostTypeId Enum('Question' = 1, 'Answer' = 2, 'Wiki' = 3, 'TagWikiExcerpt' = 4, 'TagWiki' = 5, 'ModeratorNomination' = 6, 'WikiPlaceholder' = 7, 'PrivilegeWiki' = 8), AcceptedAnswerId UInt32, CreationDate DateTime, diff --git a/docs/best-practices/selecting_an_insert_strategy.md b/docs/best-practices/selecting_an_insert_strategy.md index f9ffca9edf5..5c1afd5c731 100644 --- a/docs/best-practices/selecting_an_insert_strategy.md +++ b/docs/best-practices/selecting_an_insert_strategy.md @@ -50,7 +50,6 @@ After ⑥ receiving the data, ClickHouse ⑦ decompresses it if compression was Using the values from that formatted data and the target table's [DDL](/sql-reference/statements/create/table) statement, ClickHouse ⑨ builds an in-memory [block](/development/architecture#block) in the MergeTree format, ⑩ [sorts](/parts#what-are-table-parts-in-clickhouse) rows by the primary key columns if they are not already pre-sorted, ⑪ creates a [sparse primary index](/guides/best-practices/sparse-primary-indexes), ⑫ applies [per-column compression](/parts#what-are-table-parts-in-clickhouse), and ⑬ writes the data as a new ⑭ [data part](/parts) to disk. - ### Batch inserts if synchronous {#batch-inserts-if-synchronous} @@ -62,7 +61,7 @@ Synchronous inserts are also **idempotent**. When using MergeTree engines, Click * The insert succeeded but the client never received an acknowledgment due to a network interruption. * The insert failed server-side and timed out. -In both cases, it's safe to **retry the insert** - as long as the batch contents and order remain identical. For this reason, it's critical that clients retry consistently, without modifying or reordering data. + In both cases, it's safe to **retry the insert** - as long as the batch contents and order remain identical. For this reason, it's critical that clients retry consistently, without modifying or reordering data. ### Choose the right insert target {#choose-the-right-insert-target} @@ -71,11 +70,11 @@ For sharded clusters, you have two options: * Insert directly into a **MergeTree** or **ReplicatedMergeTree** table. This is the most efficient option when the client can perform load balancing across shards. With `internal_replication = true`, ClickHouse handles replication transparently. * Insert into a [Distributed table](/engines/table-engines/special/distributed). This allows clients to send data to any node and let ClickHouse forward it to the correct shard. This is simpler but slightly less performant due to the extra forwarding step. `internal_replication = true` is still recommended. -**In ClickHouse Cloud all nodes read and write to the same single shard. Inserts are automatically balanced across nodes. Users can simply send inserts to the exposed endpoint.** + **In ClickHouse Cloud all nodes read and write to the same single shard. Inserts are automatically balanced across nodes. Users can simply send inserts to the exposed endpoint.** ### Choose the right format {#choose-the-right-format} -Choosing the right input format is crucial for efficient data ingestion in ClickHouse. 
With over 70 supported formats, selecting the most performant option can significantly impact insert speed, CPU and memory usage, and overall system efficiency. +Choosing the right input format is crucial for efficient data ingestion in ClickHouse. With over 70 supported formats, selecting the most performant option can significantly impact insert speed, CPU and memory usage, and overall system efficiency. While flexibility is useful for data engineering and file-based imports, **applications should prioritize performance-oriented formats**: @@ -98,11 +97,11 @@ ClickHouse supports several compression codecs during data transmission. Two com * **LZ4**: Fast and lightweight. It reduces data size significantly with minimal CPU overhead, making it ideal for high-throughput inserts and default in most ClickHouse clients. * **ZSTD**: Higher compression ratio but more CPU-intensive. It's useful when network transfer costs are high—such as in cross-region or cloud provider scenarios—though it increases client-side compute and server-side decompression time slightly. -Best practice: Use LZ4 unless you have constrained bandwidth or incur data egress costs - then consider ZSTD. + Best practice: Use LZ4 unless you have constrained bandwidth or incur data egress costs - then consider ZSTD. -:::note -In tests from the [FastFormats benchmark](https://clickhouse.com/blog/clickhouse-input-format-matchup-which-is-fastest-most-efficient), LZ4-compressed Native inserts reduced data size by more than 50%, cutting ingestion time from 150s to 131s for a 5.6 GiB dataset. Switching to ZSTD compressed the same dataset down to 1.69 GiB, but increased server-side processing time slightly. -::: + :::note + In tests from the [FastFormats benchmark](https://clickhouse.com/blog/clickhouse-input-format-matchup-which-is-fastest-most-efficient), LZ4-compressed Native inserts reduced data size by more than 50%, cutting ingestion time from 150s to 131s for a 5.6 GiB dataset. Switching to ZSTD compressed the same dataset down to 1.69 GiB, but increased server-side processing time slightly. + ::: #### Compression reduces resource usage {#compression-reduces-resource-usage} @@ -118,11 +117,11 @@ With the [HTTP interface](/interfaces/http), use the Content-Encoding header to ### Pre-sort if low cost {#pre-sort-if-low-cost} -Pre-sorting data by primary key before insertion can improve ingestion efficiency in ClickHouse, particularly for large batches. +Pre-sorting data by primary key before insertion can improve ingestion efficiency in ClickHouse, particularly for large batches. When data arrives pre-sorted, ClickHouse can skip or simplify the internal sorting step during part creation, reducing CPU usage and accelerating the insert process. Pre-sorting also improves compression efficiency, since similar values are grouped together - enabling codecs like LZ4 or ZSTD to achieve a better compression ratio. This is especially beneficial when combined with large batch inserts and compression, as it reduces both the processing overhead and the amount of data transferred. -**That said, pre-sorting is an optional optimization—not a requirement.** ClickHouse sorts data highly efficiently using parallel processing, and in many cases, server-side sorting is faster or more convenient than pre-sorting client-side. 
+**That said, pre-sorting is an optional optimization—not a requirement.** ClickHouse sorts data highly efficiently using parallel processing, and in many cases, server-side sorting is faster or more convenient than pre-sorting client-side. **We recommend pre-sorting only if the data is already nearly ordered or if client-side resources (CPU, memory) are sufficient and underutilized.** In latency-sensitive or high-throughput use cases, such as observability, where data arrives out of order or from many agents, it's often better to skip pre-sorting and rely on ClickHouse's built-in performance. @@ -134,19 +133,16 @@ When data arrives pre-sorted, ClickHouse can skip or simplify the internal sorti ### Native {#choose-an-interface-native} -ClickHouse offers two main interfaces for data ingestion: the **native interface** and the **HTTP interface** - each with trade-offs between performance and flexibility. The native interface, used by [clickhouse-client](/interfaces/cli) and select language clients like Go and C++, is purpose-built for performance. It always transmits data in ClickHouse's highly efficient Native format, supports block-wise compression with LZ4 or ZSTD, and minimizes server-side processing by offloading work such as parsing and format conversion to the client. +ClickHouse offers two main interfaces for data ingestion: the **native interface** and the **HTTP interface** - each with trade-offs between performance and flexibility. The native interface, used by [clickhouse-client](/interfaces/cli) and select language clients like Go and C++, is purpose-built for performance. It always transmits data in ClickHouse's highly efficient Native format, supports block-wise compression with LZ4 or ZSTD, and minimizes server-side processing by offloading work such as parsing and format conversion to the client. It even enables client-side computation of MATERIALIZED and DEFAULT column values, allowing the server to skip these steps entirely. This makes the native interface ideal for high-throughput ingestion scenarios where efficiency is critical. ### HTTP {#choose-an-interface-http} -Unlike many traditional databases, ClickHouse also supports an HTTP interface. **This, by contrast, prioritizes compatibility and flexibility.** It allows data to be sent in [any supported format](/integrations/data-formats) - including JSON, CSV, Parquet, and others - and is widely supported across most ClickHouse clients, including Python, Java, JavaScript, and Rust. +Unlike many traditional databases, ClickHouse also supports an HTTP interface. **This, by contrast, prioritizes compatibility and flexibility.** It allows data to be sent in [any supported format](/integrations/data-formats) - including JSON, CSV, Parquet, and others - and is widely supported across most ClickHouse clients, including Python, Java, JavaScript, and Rust. This is often preferable to ClickHouse's native protocol as it allows traffic to be easily switched with load balancers. We expect small differences in insert performance with the native protocol, which incurs a little less overhead. However, it lacks the native protocol's deeper integration and cannot perform client-side optimizations like materialized value computation or automatic conversion to Native format. While HTTP inserts can still be compressed using standard HTTP headers (e.g. `Content-Encoding: lz4`), the compression is applied to the entire payload rather than individual data blocks. 
This interface is often preferred in environments where protocol simplicity, load balancing, or broad format compatibility is more important than raw performance. For a more detailed description of these interfaces see [here](/interfaces/overview). - - - diff --git a/docs/best-practices/sizing-and-hardware-recommendations.md b/docs/best-practices/sizing-and-hardware-recommendations.md index 037c0ac6398..4465607b8e4 100644 --- a/docs/best-practices/sizing-and-hardware-recommendations.md +++ b/docs/best-practices/sizing-and-hardware-recommendations.md @@ -65,7 +65,7 @@ The number of CPUs you should use depends on your workload. However, we generall - **[R-type](https://aws.amazon.com/ec2/instance-types/#Memory_Optimized) (data warehousing use cases):** 8:1 memory to CPU core ratio - **[C-type](https://aws.amazon.com/ec2/instance-types/#Compute_Optimized) (compute-optimized use cases):** 2:1 memory to CPU core ratio -As an example, when using M-type CPUs, we recommend provisioning 100GB of memory per 25 CPU cores. To determine the amount of memory appropriate for your application, profiling your memory usage is necessary. You can read [this guide on debugging memory issues](/guides/developer/debugging-memory-issues) or use the [built-in observability dashboard](/operations/monitoring) to monitor ClickHouse. + As an example, when using M-type CPUs, we recommend provisioning 100GB of memory per 25 CPU cores. To determine the amount of memory appropriate for your application, profiling your memory usage is necessary. You can read [this guide on debugging memory issues](/guides/developer/debugging-memory-issues) or use the [built-in observability dashboard](/operations/monitoring) to monitor ClickHouse. ## Memory {#memory} @@ -75,8 +75,8 @@ The required volume of RAM generally depends on: - The complexity of queries. - The amount of data that is processed in queries. -In general, however, the more memory you have, the faster your queries will run. -If your use case is sensitive to price, lower amounts of memory will work as it is possible to enable settings ([`max_bytes_before_external_group_by`](/operations/settings/settings#max_bytes_before_external_group_by) and [`max_bytes_before_external_sort`](/operations/settings/settings#max_bytes_before_external_sort)) to allow spilling data to disk, but note that this may significantly affect query performance. + In general, however, the more memory you have, the faster your queries will run. + If your use case is sensitive to price, lower amounts of memory will work as it is possible to enable settings ([`max_bytes_before_external_group_by`](/operations/settings/settings#max_bytes_before_external_group_by) and [`max_bytes_before_external_sort`](/operations/settings/settings#max_bytes_before_external_sort)) to allow spilling data to disk, but note that this may significantly affect query performance. ### What should the memory to storage ratio be? 
{#what-should-the-memory-to-storage-ratio-be} diff --git a/docs/best-practices/use_materialized_views.md b/docs/best-practices/use_materialized_views.md index ed49b160a94..6c3b02bae3e 100644 --- a/docs/best-practices/use_materialized_views.md +++ b/docs/best-practices/use_materialized_views.md @@ -12,12 +12,10 @@ import Image from '@theme/IdealImage'; import incremental_materialized_view from '@site/static/images/bestpractices/incremental_materialized_view.gif'; import refreshable_materialized_view from '@site/static/images/bestpractices/refreshable_materialized_view.gif'; - ClickHouse supports two types of materialized views: [**incremental**](/materialized-view/incremental-materialized-view) and [**refreshable**](/materialized-view/refreshable-materialized-view). While both are designed to accelerate queries by pre-computing and storing results, they differ significantly in how and when the underlying queries are executed, what workloads they are suited for, and how data freshness is handled. **Users should consider materialized views for specific query patterns which need to be accelerated, assuming previous best practices [regarding type](/best-practices/select-data-types) and [primary key optimization](/best-practices/choosing-a-primary-key) have been performed.** - **Incremental materialized views** are updated in real-time. As new data is inserted into the source table, ClickHouse automatically applies the materialized view's query to the new data block and writes the results to a separate target table. Over time, ClickHouse merges these partial results to produce a complete, up-to-date view. This approach is highly efficient because it shifts the computational cost to insert time and only processes new data. As a result, `SELECT` queries against the target table are fast and lightweight. Incremental views support all aggregation functions and scale well—even to petabytes of data—because each query operates on a small, recent subset of the dataset being inserted. Materialized Views @@ -38,15 +36,15 @@ Use incremental materialized views when: - You're aggregating or filtering large volumes of data frequently. - Your queries involve straightforward transformations or aggregations on single tables. -For examples of incremental materialized views see [here](/materialized-view/incremental-materialized-view). + For examples of incremental materialized views see [here](/materialized-view/incremental-materialized-view). ## When to use refreshable materialized views {#when-to-use-refreshable-materialized-views} -Refreshable materialized views execute their queries periodically rather than incrementally, storing the query result set for rapid retrieval. +Refreshable materialized views execute their queries periodically rather than incrementally, storing the query result set for rapid retrieval. -They are most useful when query performance is critical (e.g. sub-millisecond latency) and slightly stale results are acceptable. Since the query is re-run in full, refreshable views are best suited to queries that are either relatively fast to compute or which can be computed at infrequent intervals (e.g. hourly), such as caching “top N” results or lookup tables. +They are most useful when query performance is critical (e.g. sub-millisecond latency) and slightly stale results are acceptable. Since the query is re-run in full, refreshable views are best suited to queries that are either relatively fast to compute or which can be computed at infrequent intervals (e.g. 
hourly), such as caching “top N” results or lookup tables. -Execution frequency should be tuned carefully to avoid excessive load on the system. Extremely complex queries which consume significant resources should be scheduled cautiously - these can cause overall cluster performance to degrade by impacting caches and consuming CPU and memory. The query should run relatively quickly compared to the refresh interval to avoid overloading your cluster. For example, do not schedule a view to be updated every 10 seconds if the query itself takes at least 10 seconds to compute. +Execution frequency should be tuned carefully to avoid excessive load on the system. Extremely complex queries which consume significant resources should be scheduled cautiously - these can cause overall cluster performance to degrade by impacting caches and consuming CPU and memory. The query should run relatively quickly compared to the refresh interval to avoid overloading your cluster. For example, do not schedule a view to be updated every 10 seconds if the query itself takes at least 10 seconds to compute. ## Summary {#summary} @@ -58,7 +56,7 @@ In summary, use refreshable materialized views when: - You're performing complex joins or denormalization involving multiple tables, requiring updates whenever any source table changes. - You're building batch workflows, denormalization tasks, or creating view dependencies similar to DBT DAGs. -For examples of refreshable materialized views see [here](/materialized-view/refreshable-materialized-view). + For examples of refreshable materialized views see [here](/materialized-view/refreshable-materialized-view). ### APPEND vs REPLACE mode {#append-vs-replace-mode} @@ -74,11 +72,10 @@ Choose `APPEND` mode when: - You're building periodic snapshots or reports. - You need to incrementally collect refreshed results over time. -Choose `REPLACE` mode when: + Choose `REPLACE` mode when: - You only need the most recent result. - Stale data should be discarded entirely. - The view represents a current state or lookup. -Users can find an application of the `APPEND` functionality if building a [Medallion architecture](https://clickhouse.com/blog/building-a-medallion-architecture-for-bluesky-json-data-with-clickhouse). - + Users can find an application of the `APPEND` functionality if building a [Medallion architecture](https://clickhouse.com/blog/building-a-medallion-architecture-for-bluesky-json-data-with-clickhouse). diff --git a/docs/best-practices/using_data_skipping_indices.md b/docs/best-practices/using_data_skipping_indices.md index 53a9ef01949..77156ef4a1c 100644 --- a/docs/best-practices/using_data_skipping_indices.md +++ b/docs/best-practices/using_data_skipping_indices.md @@ -27,26 +27,26 @@ There are several types of data skipping indexes, each suited to different types * **bloom_filter**: Probabilistically determines if a value exists in a block, allowing fast approximate filtering for set membership. Effective for optimizing queries looking for the “needle in a haystack”, where a positive match is needed. * **tokenbf_v1 / ngrambf_v1**: Specialized Bloom filter variants designed for searching tokens or character sequences in strings - particularly useful for log data or text search use cases. -While powerful, skip indexes must be used with care. They only provide benefit when they eliminate a meaningful number of data blocks, and can actually introduce overhead if the query or data structure doesn't align. 
If even a single matching value exists in a block, that entire block must still be read. + While powerful, skip indexes must be used with care. They only provide benefit when they eliminate a meaningful number of data blocks, and can actually introduce overhead if the query or data structure doesn't align. If even a single matching value exists in a block, that entire block must still be read. -**Effective skip index usage often depends on a strong correlation between the indexed column and the table's primary key, or inserting data in a way that groups similar values together.** + **Effective skip index usage often depends on a strong correlation between the indexed column and the table's primary key, or inserting data in a way that groups similar values together.** -In general, data skipping indices are best applied after ensuring proper primary key design and type optimization. They are particularly useful for: + In general, data skipping indices are best applied after ensuring proper primary key design and type optimization. They are particularly useful for: * Columns with high overall cardinality but low cardinality within a block. * Rare values that are critical for search (e.g. error codes, specific IDs). * Cases where filtering occurs on non-primary key columns with localized distribution. -Always: + Always: 1. test skip indexes on real data with realistic queries. Try different index types and granularity values. -2. Evaluate their impact using tools like send_logs_level='trace' and `EXPLAIN indexes=1` to view index effectiveness. +2. Evaluate their impact using tools like send_logs_level='trace' and `EXPLAIN indexes=1` to view index effectiveness. 3. Always evaluate the size of an index and how it is impacted by granularity. Reducing granularity size often will improve performance to a point, resulting in more granules being filtered and needing to be scanned. However, as index size increases with lower granularity performance can also degrade. Measure the performance and index size for various granularity data points. This is particularly pertinent on bloom filter indexes. -
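To make the evaluation loop above concrete, the following sketch shows one way to add an index, materialize it for existing parts, and verify its effect with `EXPLAIN indexes = 1`. The `logs` table and `error_code` column are hypothetical placeholders:

```sql
-- Hypothetical table and column names; substitute your own schema.
ALTER TABLE logs ADD INDEX error_code_idx error_code TYPE bloom_filter(0.01) GRANULARITY 4;

-- Build the index for parts that existed before the index was added.
ALTER TABLE logs MATERIALIZE INDEX error_code_idx;

-- Confirm that granules are actually skipped for a representative query.
EXPLAIN indexes = 1
SELECT count()
FROM logs
WHERE error_code = 500;
```

Comparing the granule counts reported with and without the index, or across different `GRANULARITY` values, is usually the quickest way to see whether the index pays for its storage and maintenance cost.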

-**When used appropriately, skip indexes can provide a substantial performance boost - when used blindly, they can add unnecessary cost.** +

+ **When used appropriately, skip indexes can provide a substantial performance boost - when used blindly, they can add unnecessary cost.** -For a more detailed guide on Data Skipping Indices see [here](/sql-reference/statements/alter/skipping-index). + For a more detailed guide on Data Skipping Indices see [here](/sql-reference/statements/alter/skipping-index). ## Example {#example} diff --git a/docs/chdb/getting-started.md b/docs/chdb/getting-started.md index f77a8681bd4..2293e900bd6 100644 --- a/docs/chdb/getting-started.md +++ b/docs/chdb/getting-started.md @@ -10,7 +10,7 @@ keywords: ['chdb', 'embedded', 'clickhouse-lite', 'in-process', 'in process'] In this guide, we're going to get up and running with the Python variant of chDB. We'll start by querying a JSON file on S3, before creating a table in chDB based on the JSON file, and doing some queries on the data. -We'll also see how to have queries return data in different formats, including Apache Arrow and Panda, and finally we'll learn how to query Pandas DataFrames. +We'll also see how to have queries return data in different formats, including Apache Arrow and Panda, and finally we'll learn how to query Pandas DataFrames. ## Setup {#setup} @@ -48,7 +48,7 @@ pip install pandas pyarrow ## Querying a JSON file in S3 {#querying-a-json-file-in-s3} -Let's now have a look at how to query a JSON file that's stored in an S3 bucket. +Let's now have a look at how to query a JSON file that's stored in an S3 bucket. The [YouTube dislikes dataset](/getting-started/example-datasets/youtube-dislikes) contains more than 4 billion rows of dislikes on YouTube videos up to 2021. We're going to work with one of the JSON files from that dataset. @@ -106,7 +106,6 @@ chdb.query( We can also count the number of rows in that file: - ```python chdb.query( """ @@ -146,7 +145,7 @@ This is fine to do with variables defined in your program, but don't do it with ## Configuring the output format {#configuring-the-output-format} -The default output format is `CSV`, but we can change that via the `output_format` parameter. +The default output format is `CSV`, but we can change that via the `output_format` parameter. chDB supports the ClickHouse data formats, as well as [some of its own](/chdb/reference/data-formats.md), including `DataFrame`, which returns a Pandas DataFrame: ```python @@ -198,7 +197,7 @@ count(): [[315746,20686]] ## Creating a table from JSON file {#creating-a-table-from-json-file} -Next, let's have a look at how to create a table in chDB. +Next, let's have a look at how to create a table in chDB. 
We need to use a different API to do that, so let's first import that: ```python @@ -225,9 +224,9 @@ We'll use the [`schema_inference_make_columns_nullable`](/operations/settings/fo ```python sess.query(f""" CREATE TABLE youtube.dislikes - ORDER BY fetch_date - EMPTY AS - SELECT * + ORDER BY fetch_date + EMPTY AS + SELECT * FROM s3('{path}','JSONLines') SETTINGS schema_inference_make_columns_nullable=0 """ @@ -236,7 +235,6 @@ sess.query(f""" We can then use the `DESCRIBE` clause to inspect the schema: - ```python sess.query(f""" DESCRIBE youtube.dislikes @@ -276,13 +274,12 @@ sess.query(f""" "video_badges","String" ``` - Next, let's populate that table: ```python sess.query(f""" INSERT INTO youtube.dislikes - SELECT * + SELECT * FROM s3('{path}','JSONLines') SETTINGS schema_inference_make_columns_nullable=0 """ @@ -295,9 +292,9 @@ Let's create a different table using that technique: ```python sess.query(f""" CREATE TABLE youtube.dislikes2 - ORDER BY fetch_date - AS - SELECT * + ORDER BY fetch_date + AS + SELECT * FROM s3('{path}','JSONLines') SETTINGS schema_inference_make_columns_nullable=0 """ @@ -374,7 +371,7 @@ You can also read more about querying Pandas DataFrames in the [Querying Pandas ## Next steps {#next-steps} -Hopefully, this guide has given you a good overview of chDB. +Hopefully, this guide has given you a good overview of chDB. To learn more about how to use it, see the following developer guides: * [Querying Pandas DataFrames](guides/querying-pandas.md) diff --git a/docs/chdb/guides/clickhouse-local.md b/docs/chdb/guides/clickhouse-local.md index 4fd2ae78354..61e200ef8c6 100644 --- a/docs/chdb/guides/clickhouse-local.md +++ b/docs/chdb/guides/clickhouse-local.md @@ -117,7 +117,6 @@ quants: [0,9976599,2147776478,4209286886] We can also insert data into this database from chDB: - ```python sess.query(""" INSERT INTO foo.randomNumbers diff --git a/docs/chdb/guides/index.md b/docs/chdb/guides/index.md index f276e625a2e..d09df375f57 100644 --- a/docs/chdb/guides/index.md +++ b/docs/chdb/guides/index.md @@ -7,9 +7,9 @@ keywords: ['chdb', 'guides'] Take a look at our chDB developer guides below: - diff --git a/docs/chdb/guides/jupysql.md b/docs/chdb/guides/jupysql.md index 2a71d1c3400..e9030b23076 100644 --- a/docs/chdb/guides/jupysql.md +++ b/docs/chdb/guides/jupysql.md @@ -99,7 +99,6 @@ Next, we'll display the display limit so that results of queries won't be trunca We've downloaded a bunch of files with the `atp_rankings` prefix. Let's use the `DESCRIBE` clause to understand the schema: - ```python %%sql DESCRIBE file('atp_rankings*.csv') @@ -213,7 +212,6 @@ Looks good - the output, as expected, is the same as when querying the CSV files We're going to follow the same process for the player metadata. 
This time the data is all in a single CSV file, so let's download that file: - ```python _ = urlretrieve( f"{base}/atp_players.csv", @@ -244,7 +242,6 @@ SETTINGS schema_inference_make_columns_nullable=0 Once that's finished running, we can have a look at the data we've ingested: - ```python %sql SELECT * FROM atp.players LIMIT 10 ``` @@ -412,7 +409,6 @@ WHERE rank <= 100 We can then create a histogram by running the following: - ```python from sql.ggplot import ggplot, geom_histogram, aes @@ -426,4 +422,3 @@ plot = ( ``` Histogram of player rankings in ATP dataset - diff --git a/docs/chdb/guides/query-remote-clickhouse.md b/docs/chdb/guides/query-remote-clickhouse.md index 5d78c944e3d..6df7e7f01cf 100644 --- a/docs/chdb/guides/query-remote-clickhouse.md +++ b/docs/chdb/guides/query-remote-clickhouse.md @@ -58,15 +58,15 @@ We're going to query ClickPy using the `remoteSecure` function. This function takes in a host name, table name, and username at a minimum. We can write the following query to return the number of downloads per day of the [`openai` package](https://clickpy.clickhouse.com/dashboard/openai) as a Pandas DataFrame: - + ```python query = """ SELECT toStartOfDay(date)::Date32 AS x, sum(count) AS y FROM remoteSecure( - 'clickpy-clickhouse.clickhouse.com', - 'pypi.pypi_downloads_per_day', + 'clickpy-clickhouse.clickhouse.com', + 'pypi.pypi_downloads_per_day', 'play' ) WHERE project = 'openai' @@ -94,15 +94,14 @@ openai_df.sort_values(by=["x"], ascending=False).head(n=10) Now let's do the same to return the downloads for [`scikit-learn`](https://clickpy.clickhouse.com/dashboard/scikit-learn): - ```python query = """ SELECT toStartOfDay(date)::Date32 AS x, sum(count) AS y FROM remoteSecure( - 'clickpy-clickhouse.clickhouse.com', - 'pypi.pypi_downloads_per_day', + 'clickpy-clickhouse.clickhouse.com', + 'pypi.pypi_downloads_per_day', 'play' ) WHERE project = 'scikit-learn' @@ -134,8 +133,8 @@ We now have two DataFrames, which we can merge together based on date (which is ```python df = openai_df.merge( - sklearn_df, - on="x", + sklearn_df, + on="x", suffixes=("_openai", "_sklearn") ) df.head(n=5) @@ -168,7 +167,7 @@ df.head(n=5) ## Querying Pandas DataFrames {#querying-pandas-dataframes} -Next, let's say we want to find the dates with the best and worst ratios. +Next, let's say we want to find the dates with the best and worst ratios. We can go back to chDB and compute those values: ```python diff --git a/docs/chdb/guides/querying-apache-arrow.md b/docs/chdb/guides/querying-apache-arrow.md index 688cc289ed4..cf88f5d9f7d 100644 --- a/docs/chdb/guides/querying-apache-arrow.md +++ b/docs/chdb/guides/querying-apache-arrow.md @@ -53,8 +53,6 @@ aws s3 cp \ If you want to download more files, use `aws s3 ls` to get a list of all the files and then update the above command. ::: - - Next, we'll import the Parquet module from the `pyarrow` package: ```python @@ -141,10 +139,9 @@ chdb.query("SELECT count() FROM Python(arrow_table)", "DataFrame") 0 3864546 ``` -Now, let's do something a bit more interesting. +Now, let's do something a bit more interesting. 
The following query excludes the `quadkey` and `tile.*` columns and then computes the average and max values for all remaining column: - ```python chdb.query(""" WITH numericColumns AS ( diff --git a/docs/chdb/guides/querying-pandas.md b/docs/chdb/guides/querying-pandas.md index faa78627ac2..5b2e89ded88 100644 --- a/docs/chdb/guides/querying-pandas.md +++ b/docs/chdb/guides/querying-pandas.md @@ -156,7 +156,7 @@ Name: 0, dtype: object ## Querying Pandas DataFrames {#querying-pandas-dataframes} -Next, let's see how to query these DataFrames using chDB. +Next, let's see how to query these DataFrames using chDB. We'll import the library: ```python @@ -337,7 +337,7 @@ Then, create an `events` table based on `events_df`: ```python sess.query(""" CREATE TABLE statsbomb.events ORDER BY id AS -SELECT * +SELECT * FROM Python(events_df) """) ``` diff --git a/docs/chdb/guides/querying-parquet.md b/docs/chdb/guides/querying-parquet.md index ecc63923236..68273a98afd 100644 --- a/docs/chdb/guides/querying-parquet.md +++ b/docs/chdb/guides/querying-parquet.md @@ -54,7 +54,7 @@ Let's use the `DESCRIBE` clause to see the fields returned when we use this form ```python query = """ DESCRIBE s3( - 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/amazon_reviews_2015.snappy.parquet', + 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/amazon_reviews_2015.snappy.parquet', ParquetMetadata ) SETTINGS describe_compact_output=1 @@ -82,7 +82,7 @@ Let's have now have a look at the metadata for this file. query = """ SELECT * EXCEPT(columns, row_groups) FROM s3( - 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/amazon_reviews_2015.snappy.parquet', + 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/amazon_reviews_2015.snappy.parquet', ParquetMetadata ) """ @@ -119,7 +119,7 @@ WITH rowGroups AS ( ARRAY JOIN row_groups AS rg LIMIT 1 ) -SELECT tupleElement(c, 'name') AS name, tupleElement(c, 'total_compressed_size') AS total_compressed_size, +SELECT tupleElement(c, 'name') AS name, tupleElement(c, 'total_compressed_size') AS total_compressed_size, tupleElement(c, 'total_uncompressed_size') AS total_uncompressed_size, tupleElement(tupleElement(c, 'statistics'), 'min') AS min, tupleElement(tupleElement(c, 'statistics'), 'max') AS max diff --git a/docs/chdb/guides/querying-s3-bucket.md b/docs/chdb/guides/querying-s3-bucket.md index 101f24e491b..0fcb198e2cb 100644 --- a/docs/chdb/guides/querying-s3-bucket.md +++ b/docs/chdb/guides/querying-s3-bucket.md @@ -87,7 +87,7 @@ chdb.query(""" SELECT _file, count() AS count, - formatReadableQuantity(count) AS readableCount + formatReadableQuantity(count) AS readableCount FROM s3('s3://datasets-documentation/amazon_reviews/*.parquet') GROUP BY ALL SETTINGS output_format_pretty_row_numbers=0 @@ -114,7 +114,7 @@ chdb.query(""" SELECT _file, count() AS count, - formatReadableQuantity(count) AS readableCount + formatReadableQuantity(count) AS readableCount FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/*.parquet') GROUP BY ALL SETTINGS output_format_pretty_row_numbers=0 @@ -148,33 +148,33 @@ SETTINGS describe_compact_output=1 14. │ review_headline │ Nullable(String) │ 15. 
│ review_body │ Nullable(String) │ └───────────────────┴──────────────────┘ -``` + ``` -Let's now compute the top product categories based on number of reviews, as well as computing the average star rating: + Let's now compute the top product categories based on number of reviews, as well as computing the average star rating: -```python -chdb.query(""" -SELECT product_category, count() AS reviews, round(avg(star_rating), 2) as avg -FROM s3('s3://datasets-documentation/amazon_reviews/*.parquet') -GROUP BY ALL -LIMIT 10 -""", 'PrettyCompact') -``` + ```python + chdb.query(""" + SELECT product_category, count() AS reviews, round(avg(star_rating), 2) as avg + FROM s3('s3://datasets-documentation/amazon_reviews/*.parquet') + GROUP BY ALL + LIMIT 10 + """, 'PrettyCompact') + ``` -```text + ```text ┌─product_category─┬──reviews─┬──avg─┐ - 1. │ Toys │ 4864056 │ 4.21 │ - 2. │ Apparel │ 5906085 │ 4.11 │ - 3. │ Luggage │ 348644 │ 4.22 │ - 4. │ Kitchen │ 4880297 │ 4.21 │ - 5. │ Books │ 19530930 │ 4.34 │ - 6. │ Outdoors │ 2302327 │ 4.24 │ - 7. │ Video │ 380596 │ 4.19 │ - 8. │ Grocery │ 2402365 │ 4.31 │ - 9. │ Shoes │ 4366757 │ 4.24 │ + 1. │ Toys │ 4864056 │ 4.21 │ + 2. │ Apparel │ 5906085 │ 4.11 │ + 3. │ Luggage │ 348644 │ 4.22 │ + 4. │ Kitchen │ 4880297 │ 4.21 │ + 5. │ Books │ 19530930 │ 4.34 │ + 6. │ Outdoors │ 2302327 │ 4.24 │ + 7. │ Video │ 380596 │ 4.19 │ + 8. │ Grocery │ 2402365 │ 4.31 │ + 9. │ Shoes │ 4366757 │ 4.24 │ 10. │ Jewelry │ 1767667 │ 4.14 │ └──────────────────┴──────────┴──────┘ -``` + ``` ## Querying files in a private S3 bucket {#querying-files-in-a-private-s3-bucket} diff --git a/docs/chdb/index.md b/docs/chdb/index.md index cdb54fe6265..c14c519ffb3 100644 --- a/docs/chdb/index.md +++ b/docs/chdb/index.md @@ -37,11 +37,10 @@ chDB supports Parquet, CSV, JSON, Apache Arrow, ORC, and [60+ more formats](/int * [Querying remote ClickHouse](guides/query-remote-clickhouse.md) * [Using clickhouse-local database](guides/clickhouse-local.md) - + chDB lets you + - Supports Python DB API 2.0: [example](https://github.com/chdb-io/chdb/blob/main/examples/dbapi.py) and [custom UDF Functions](https://github.com/chdb-io/chdb/blob/main/examples/udf.py) --> ## An introductory video {#an-introductory-video} @@ -57,7 +56,6 @@ You can listen to a brief project introduction to chDB, courtesy of Alexey Milov - Read about chDB and its use cases on the [Official ClickHouse Blog](https://clickhouse.com/blog/welcome-chdb-to-clickhouse) - Discover chDB in your browser using [codapi examples](https://antonz.org/trying-chdb/) - ## What license does it use? {#what-license-does-it-use} chDB is available under the Apache License, Version 2.0. diff --git a/docs/chdb/install/bun.md b/docs/chdb/install/bun.md index f8e9e134621..16f2b86740a 100644 --- a/docs/chdb/install/bun.md +++ b/docs/chdb/install/bun.md @@ -53,4 +53,3 @@ console.log(result); sess.cleanup(); // cleanup session, this will delete the database ``` - diff --git a/docs/chdb/install/c.md b/docs/chdb/install/c.md index 9ff218aab17..8d5a914c1ac 100644 --- a/docs/chdb/install/c.md +++ b/docs/chdb/install/c.md @@ -18,7 +18,6 @@ Install [libchdb](https://github.com/chdb-io/chdb): curl -sL https://lib.chdb.io | bash ``` - ## Usage {#usage} Follow the instructions for [libchdb](https://github.com/chdb-io/chdb/blob/main/bindings.md) to get started. 
diff --git a/docs/chdb/install/nodejs.md b/docs/chdb/install/nodejs.md index 1bef298ba26..63aa51da075 100644 --- a/docs/chdb/install/nodejs.md +++ b/docs/chdb/install/nodejs.md @@ -26,7 +26,6 @@ npm i chdb You can find the GitHub repository for the project at [chdb-io/chdb-node](https://github.com/chdb-io/chdb-node). - ## Usage {#usage} You can leverage the power of chdb in your NodeJS applications by importing and using the chdb-node module: @@ -64,4 +63,3 @@ npm run libchdb npm install npm run test ``` - diff --git a/docs/chdb/install/python.md b/docs/chdb/install/python.md index 446251f7c2c..32f538e0797 100644 --- a/docs/chdb/install/python.md +++ b/docs/chdb/install/python.md @@ -167,7 +167,7 @@ Some notes on the chDB Python UDF (User Defined Function) decorator. ``` 6. The Python interpreter used is the same as the one used to run the script. You can get it from `sys.executable`. -see also: [test_udf.py](https://github.com/chdb-io/chdb/blob/main/tests/test_udf.py). + see also: [test_udf.py](https://github.com/chdb-io/chdb/blob/main/tests/test_udf.py). ### Python table engine {#python-table-engine} @@ -212,12 +212,10 @@ chdb.query( 1. be stateful, the cursor should be updated in the `read` method. 3. An optional `get_schema` method can be implemented to return the schema of the table. The prototype is `def get_schema(self) -> List[Tuple[str, str]]:`, the return value is a list of tuples, each tuple contains the column name and the column type. The column type should be one of [the following](/sql-reference/data-types). -
- -```python -import chdb + ```python + import chdb -class myReader(chdb.PyReader): + class myReader(chdb.PyReader): def __init__(self, data): self.data = data self.cursor = 0 @@ -231,19 +229,19 @@ class myReader(chdb.PyReader): self.cursor += len(block[0]) return block -reader = myReader( + reader = myReader( { "a": [1, 2, 3, 4, 5, 6], "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"], } -) + ) -chdb.query( + chdb.query( "SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b" -).show() -``` + ).show() + ``` -See also: [test_query_py.py](https://github.com/chdb-io/chdb/blob/main/tests/test_query_py.py). + See also: [test_query_py.py](https://github.com/chdb-io/chdb/blob/main/tests/test_query_py.py). ## Limitations {#limitations} @@ -252,7 +250,4 @@ See also: [test_query_py.py](https://github.com/chdb-io/chdb/blob/main/tests/tes 1. Python Object type will be converted to String 1. Pandas DataFrame performance is all of the best, Arrow Table is better than PyReader -
- -For more examples, see [examples](https://github.com/chdb-io/chdb/tree/main/examples) and [tests](https://github.com/chdb-io/chdb/tree/main/tests). - + For more examples, see [examples](https://github.com/chdb-io/chdb/tree/main/examples) and [tests](https://github.com/chdb-io/chdb/tree/main/tests). diff --git a/docs/chdb/reference/data-formats.md b/docs/chdb/reference/data-formats.md index 789b5cf21cc..043c9d5c5fd 100644 --- a/docs/chdb/reference/data-formats.md +++ b/docs/chdb/reference/data-formats.md @@ -16,85 +16,85 @@ As well as the data formats that ClickHouse supports, chDB also supports: - `DataFrame` as an input and output format, the type is Python `pandas.DataFrame`. For examples, see [`test_joindf.py`](https://github.com/chdb-io/chdb/blob/main/tests/test_joindf.py) - `Debug` as ab output (as an alias of `CSV`), but with enabled debug verbose output from ClickHouse. -The supported data formats from ClickHouse are: + The supported data formats from ClickHouse are: -| Format | Input | Output | -|---------------------------------|-------|--------| -| TabSeparated | ✔ | ✔ | -| TabSeparatedRaw | ✔ | ✔ | -| TabSeparatedWithNames | ✔ | ✔ | -| TabSeparatedWithNamesAndTypes | ✔ | ✔ | -| TabSeparatedRawWithNames | ✔ | ✔ | -| TabSeparatedRawWithNamesAndTypes| ✔ | ✔ | -| Template | ✔ | ✔ | -| TemplateIgnoreSpaces | ✔ | ✗ | -| CSV | ✔ | ✔ | -| CSVWithNames | ✔ | ✔ | -| CSVWithNamesAndTypes | ✔ | ✔ | -| CustomSeparated | ✔ | ✔ | -| CustomSeparatedWithNames | ✔ | ✔ | -| CustomSeparatedWithNamesAndTypes| ✔ | ✔ | -| SQLInsert | ✗ | ✔ | -| Values | ✔ | ✔ | -| Vertical | ✗ | ✔ | -| JSON | ✔ | ✔ | -| JSONAsString | ✔ | ✗ | -| JSONStrings | ✔ | ✔ | -| JSONColumns | ✔ | ✔ | -| JSONColumnsWithMetadata | ✔ | ✔ | -| JSONCompact | ✔ | ✔ | -| JSONCompactStrings | ✗ | ✔ | -| JSONCompactColumns | ✔ | ✔ | -| JSONEachRow | ✔ | ✔ | -| PrettyJSONEachRow | ✗ | ✔ | -| JSONEachRowWithProgress | ✗ | ✔ | -| JSONStringsEachRow | ✔ | ✔ | -| JSONStringsEachRowWithProgress | ✗ | ✔ | -| JSONCompactEachRow | ✔ | ✔ | -| JSONCompactEachRowWithNames | ✔ | ✔ | -| JSONCompactEachRowWithNamesAndTypes | ✔ | ✔ | -| JSONCompactStringsEachRow | ✔ | ✔ | -| JSONCompactStringsEachRowWithNames | ✔ | ✔ | -| JSONCompactStringsEachRowWithNamesAndTypes | ✔ | ✔ | -| JSONObjectEachRow | ✔ | ✔ | -| BSONEachRow | ✔ | ✔ | -| TSKV | ✔ | ✔ | -| Pretty | ✗ | ✔ | -| PrettyNoEscapes | ✗ | ✔ | -| PrettyMonoBlock | ✗ | ✔ | -| PrettyNoEscapesMonoBlock | ✗ | ✔ | -| PrettyCompact | ✗ | ✔ | -| PrettyCompactNoEscapes | ✗ | ✔ | -| PrettyCompactMonoBlock | ✗ | ✔ | -| PrettyCompactNoEscapesMonoBlock | ✗ | ✔ | -| PrettySpace | ✗ | ✔ | -| PrettySpaceNoEscapes | ✗ | ✔ | -| PrettySpaceMonoBlock | ✗ | ✔ | -| PrettySpaceNoEscapesMonoBlock | ✗ | ✔ | -| Prometheus | ✗ | ✔ | -| Protobuf | ✔ | ✔ | -| ProtobufSingle | ✔ | ✔ | -| Avro | ✔ | ✔ | -| AvroConfluent | ✔ | ✗ | -| Parquet | ✔ | ✔ | -| ParquetMetadata | ✔ | ✗ | -| Arrow | ✔ | ✔ | -| ArrowStream | ✔ | ✔ | -| ORC | ✔ | ✔ | -| One | ✔ | ✗ | -| RowBinary | ✔ | ✔ | -| RowBinaryWithNames | ✔ | ✔ | -| RowBinaryWithNamesAndTypes | ✔ | ✔ | -| RowBinaryWithDefaults | ✔ | ✔ | -| Native | ✔ | ✔ | -| Null | ✗ | ✔ | -| XML | ✗ | ✔ | -| CapnProto | ✔ | ✔ | -| LineAsString | ✔ | ✔ | -| Regexp | ✔ | ✗ | -| RawBLOB | ✔ | ✔ | -| MsgPack | ✔ | ✔ | -| MySQLDump | ✔ | ✗ | -| Markdown | ✗ | ✔ | + | Format | Input | Output | + |---------------------------------|-------|--------| + | TabSeparated | ✔ | ✔ | + | TabSeparatedRaw | ✔ | ✔ | + | TabSeparatedWithNames | ✔ | ✔ | + | TabSeparatedWithNamesAndTypes | ✔ | ✔ | + | 
TabSeparatedRawWithNames | ✔ | ✔ | + | TabSeparatedRawWithNamesAndTypes| ✔ | ✔ | + | Template | ✔ | ✔ | + | TemplateIgnoreSpaces | ✔ | ✗ | + | CSV | ✔ | ✔ | + | CSVWithNames | ✔ | ✔ | + | CSVWithNamesAndTypes | ✔ | ✔ | + | CustomSeparated | ✔ | ✔ | + | CustomSeparatedWithNames | ✔ | ✔ | + | CustomSeparatedWithNamesAndTypes| ✔ | ✔ | + | SQLInsert | ✗ | ✔ | + | Values | ✔ | ✔ | + | Vertical | ✗ | ✔ | + | JSON | ✔ | ✔ | + | JSONAsString | ✔ | ✗ | + | JSONStrings | ✔ | ✔ | + | JSONColumns | ✔ | ✔ | + | JSONColumnsWithMetadata | ✔ | ✔ | + | JSONCompact | ✔ | ✔ | + | JSONCompactStrings | ✗ | ✔ | + | JSONCompactColumns | ✔ | ✔ | + | JSONEachRow | ✔ | ✔ | + | PrettyJSONEachRow | ✗ | ✔ | + | JSONEachRowWithProgress | ✗ | ✔ | + | JSONStringsEachRow | ✔ | ✔ | + | JSONStringsEachRowWithProgress | ✗ | ✔ | + | JSONCompactEachRow | ✔ | ✔ | + | JSONCompactEachRowWithNames | ✔ | ✔ | + | JSONCompactEachRowWithNamesAndTypes | ✔ | ✔ | + | JSONCompactStringsEachRow | ✔ | ✔ | + | JSONCompactStringsEachRowWithNames | ✔ | ✔ | + | JSONCompactStringsEachRowWithNamesAndTypes | ✔ | ✔ | + | JSONObjectEachRow | ✔ | ✔ | + | BSONEachRow | ✔ | ✔ | + | TSKV | ✔ | ✔ | + | Pretty | ✗ | ✔ | + | PrettyNoEscapes | ✗ | ✔ | + | PrettyMonoBlock | ✗ | ✔ | + | PrettyNoEscapesMonoBlock | ✗ | ✔ | + | PrettyCompact | ✗ | ✔ | + | PrettyCompactNoEscapes | ✗ | ✔ | + | PrettyCompactMonoBlock | ✗ | ✔ | + | PrettyCompactNoEscapesMonoBlock | ✗ | ✔ | + | PrettySpace | ✗ | ✔ | + | PrettySpaceNoEscapes | ✗ | ✔ | + | PrettySpaceMonoBlock | ✗ | ✔ | + | PrettySpaceNoEscapesMonoBlock | ✗ | ✔ | + | Prometheus | ✗ | ✔ | + | Protobuf | ✔ | ✔ | + | ProtobufSingle | ✔ | ✔ | + | Avro | ✔ | ✔ | + | AvroConfluent | ✔ | ✗ | + | Parquet | ✔ | ✔ | + | ParquetMetadata | ✔ | ✗ | + | Arrow | ✔ | ✔ | + | ArrowStream | ✔ | ✔ | + | ORC | ✔ | ✔ | + | One | ✔ | ✗ | + | RowBinary | ✔ | ✔ | + | RowBinaryWithNames | ✔ | ✔ | + | RowBinaryWithNamesAndTypes | ✔ | ✔ | + | RowBinaryWithDefaults | ✔ | ✔ | + | Native | ✔ | ✔ | + | Null | ✗ | ✔ | + | XML | ✗ | ✔ | + | CapnProto | ✔ | ✔ | + | LineAsString | ✔ | ✔ | + | Regexp | ✔ | ✗ | + | RawBLOB | ✔ | ✔ | + | MsgPack | ✔ | ✔ | + | MySQLDump | ✔ | ✗ | + | Markdown | ✗ | ✔ | -For further information and examples, see [ClickHouse formats for input and output data](/interfaces/formats). + For further information and examples, see [ClickHouse formats for input and output data](/interfaces/formats). diff --git a/docs/chdb/reference/sql-reference.md b/docs/chdb/reference/sql-reference.md index 517f51432fa..872f5f1ccea 100644 --- a/docs/chdb/reference/sql-reference.md +++ b/docs/chdb/reference/sql-reference.md @@ -16,7 +16,7 @@ chdb supports the same SQL syntax, statements, engines and functions as ClickHou | [Database Engines](/engines/database-engines) | | [Regular Functions](/sql-reference/functions) | | [Aggregate Functions](/sql-reference/aggregate-functions) | -| [Table Functions](/sql-reference/table-functions) | +| [Table Functions](/sql-reference/table-functions) | | [Window Functions](/sql-reference/window-functions) | For further information and examples, see the [ClickHouse SQL Reference](/sql-reference). 
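Because chDB embeds the ClickHouse engine, the statements, functions and formats listed above can be exercised directly from Python. The short sketch below is illustrative only; it assumes nothing beyond `pip install chdb`, and the particular functions and formats it uses are arbitrary choices rather than a required set:

```python
import chdb

# numbers() is a ClickHouse table function; count() and sumIf() are aggregate functions.
print(chdb.query(
    "SELECT count() AS n, sumIf(number, number % 2 = 0) AS even_sum FROM numbers(10)",
    "PrettyCompact",
))

# The same engine can hand results back as a Pandas DataFrame,
# one of the output formats listed in the table above.
df = chdb.query("SELECT toStartOfMonth(toDate('2024-03-15')) AS month", "DataFrame")
print(df)
```

Any format marked as a supported output in the table can be passed as the second argument to `chdb.query` in the same way.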
diff --git a/docs/cloud/bestpractices/multitenancy.md b/docs/cloud/bestpractices/multitenancy.md index 9f47dc580d1..cd9e0cfe667 100644 --- a/docs/cloud/bestpractices/multitenancy.md +++ b/docs/cloud/bestpractices/multitenancy.md @@ -25,7 +25,7 @@ In cases where there is a significant gap in data volume between tenants, smalle ### Example {#shared-table-example} -This is an example of a shared table multi-tenancy model implementation. +This is an example of a shared table multi-tenancy model implementation. First, let's create a shared table with a field `tenant_id` included in the primary key. @@ -64,12 +64,12 @@ VALUES Then let's create two users `user_1` and `user_2`. ```sql --- Create users +-- Create users CREATE USER user_1 IDENTIFIED BY '' CREATE USER user_2 IDENTIFIED BY '' ``` -We [create row policies](/sql-reference/statements/create/row-policy) that restricts `user_1` and `user_2` to access only their tenants' data. +We [create row policies](/sql-reference/statements/create/row-policy) that restricts `user_1` and `user_2` to access only their tenants' data. ```sql -- Create row policies @@ -77,7 +77,7 @@ CREATE ROW POLICY user_filter_1 ON default.events USING tenant_id=1 TO user_1 CREATE ROW POLICY user_filter_2 ON default.events USING tenant_id=2 TO user_2 ``` -Then [`GRANT SELECT`](/sql-reference/statements/grant#usage) privileges on the shared table using a common role. +Then [`GRANT SELECT`](/sql-reference/statements/grant#usage) privileges on the shared table using a common role. ```sql -- Create role @@ -89,7 +89,7 @@ GRANT user_role TO user_1 GRANT user_role TO user_2 ``` -Now you can connect as `user_1` and run a simple select. Only rows from the first tenant are returned. +Now you can connect as `user_1` and run a simple select. Only rows from the first tenant are returned. ```sql -- Logged as user_1 @@ -102,8 +102,8 @@ FROM events 3. │ 1 │ 6b4d12e4-447d-4398-b3fa-1c1e94d71a2f │ user_logout │ 2025-03-19 08:10:00 │ 1001 │ {"device": "desktop", "location": "LA"} │ 4. │ 1 │ 83b5eb72-aba3-4038-bc52-6c08b6423615 │ purchase │ 2025-03-19 08:45:00 │ 1003 │ {"item": "monitor", "amount": 450} │ 5. │ 1 │ 975fb0c8-55bd-4df4-843b-34f5cfeed0a9 │ user_login │ 2025-03-19 08:50:00 │ 1004 │ {"device": "desktop", "location": "LA"} │ - └───────────┴──────────────────────────────────────┴─────────────┴─────────────────────┴─────────┴─────────────────────────────────────────┘ -``` + └───────────┴──────────────────────────────────────┴─────────────┴─────────────────────┴─────────┴─────────────────────────────────────────┘ + ``` ## Separate tables {#separate-tables} @@ -111,18 +111,18 @@ In this approach, each tenant's data is stored in a separate table within the sa > **Using separate tables is a good choice when tenants have different data schemas.** -For scenarios involving a few tenants with very large datasets where query performance is critical, this approach may outperform a shared table model. Since there is no need to filter out other tenants' data, queries can be more efficient. Additionally, primary keys can be further optimized, as there is no need to include an extra field (such as a tenant ID) in the primary key. +For scenarios involving a few tenants with very large datasets where query performance is critical, this approach may outperform a shared table model. Since there is no need to filter out other tenants' data, queries can be more efficient. 
Additionally, primary keys can be further optimized, as there is no need to include an extra field (such as a tenant ID) in the primary key. Note this approach doesn't scale for 1000s of tenants. See [usage limits](/cloud/bestpractices/usage-limits). ### Example {#separate-tables-example} -This is an example of a separate tables multi-tenancy model implementation. +This is an example of a separate tables multi-tenancy model implementation. First, let's create two tables, one for events from `tenant_1` and one for the events from `tenant_2`. ```sql --- Create table for tenant 1 +-- Create table for tenant 1 CREATE TABLE events_tenant_1 ( id UUID, -- Unique event ID @@ -133,7 +133,7 @@ CREATE TABLE events_tenant_1 ) ORDER BY (timestamp, user_id) -- Primary key can focus on other attributes --- Create table for tenant 2 +-- Create table for tenant 2 CREATE TABLE events_tenant_2 ( id UUID, -- Unique event ID @@ -168,7 +168,7 @@ VALUES Then let's create two users `user_1` and `user_2`. ```sql --- Create users +-- Create users CREATE USER user_1 IDENTIFIED BY '' CREATE USER user_2 IDENTIFIED BY '' ``` @@ -181,7 +181,7 @@ GRANT SELECT ON default.events_tenant_1 TO user_1 GRANT SELECT ON default.events_tenant_2 TO user_2 ``` -Now you can connect as `user_1` and run a simple select from the table corresponding to this user. Only rows from the first tenant are returned. +Now you can connect as `user_1` and run a simple select from the table corresponding to this user. Only rows from the first tenant are returned. ```sql -- Logged as user_1 @@ -194,8 +194,8 @@ FROM default.events_tenant_1 3. │ 6b4d12e4-447d-4398-b3fa-1c1e94d71a2f │ user_logout │ 2025-03-19 08:10:00 │ 1001 │ {"device": "desktop", "location": "LA"} │ 4. │ 83b5eb72-aba3-4038-bc52-6c08b6423615 │ purchase │ 2025-03-19 08:45:00 │ 1003 │ {"item": "monitor", "amount": 450} │ 5. │ 975fb0c8-55bd-4df4-843b-34f5cfeed0a9 │ user_login │ 2025-03-19 08:50:00 │ 1004 │ {"device": "desktop", "location": "LA"} │ - └──────────────────────────────────────┴─────────────┴─────────────────────┴─────────┴─────────────────────────────────────────┘ -``` + └──────────────────────────────────────┴─────────────┴─────────────────────┴─────────┴─────────────────────────────────────────┘ + ``` ## Separate databases {#separate-databases} @@ -209,7 +209,7 @@ Note this approach doesn't scale for 1000s of tenants. See [usage limits](/cloud ### Example {#separate-databases-example} -This is an example of a separate databases multi-tenancy model implementation. +This is an example of a separate databases multi-tenancy model implementation. First, let's create two databases, one for `tenant_1` and one for `tenant_2`. @@ -268,7 +268,7 @@ VALUES Then let's create two users `user_1` and `user_2`. ```sql --- Create users +-- Create users CREATE USER user_1 IDENTIFIED BY '' CREATE USER user_2 IDENTIFIED BY '' ``` @@ -281,7 +281,7 @@ GRANT SELECT ON tenant_1.events TO user_1 GRANT SELECT ON tenant_2.events TO user_2 ``` -Now you can connect as `user_1` and run a simple select on the events table of the appropriate database. Only rows from the first tenant are returned. +Now you can connect as `user_1` and run a simple select on the events table of the appropriate database. Only rows from the first tenant are returned. ```sql -- Logged as user_1 @@ -294,20 +294,20 @@ FROM tenant_1.events 3. │ 6b4d12e4-447d-4398-b3fa-1c1e94d71a2f │ user_logout │ 2025-03-19 08:10:00 │ 1001 │ {"device": "desktop", "location": "LA"} │ 4. 
│ 83b5eb72-aba3-4038-bc52-6c08b6423615 │ purchase │ 2025-03-19 08:45:00 │ 1003 │ {"item": "monitor", "amount": 450} │ 5. │ 975fb0c8-55bd-4df4-843b-34f5cfeed0a9 │ user_login │ 2025-03-19 08:50:00 │ 1004 │ {"device": "desktop", "location": "LA"} │ - └──────────────────────────────────────┴─────────────┴─────────────────────┴─────────┴─────────────────────────────────────────┘ -``` + └──────────────────────────────────────┴─────────────┴─────────────────────┴─────────┴─────────────────────────────────────────┘ + ``` ## Compute-compute separation {#compute-compute-separation} -The three approaches described above can also be further isolated by using [Warehouses](/cloud/reference/warehouses#what-is-a-warehouse). Data is shared through a common object storage but each tenant can have its own compute service thanks to [compute-compute separation](/cloud/reference/warehouses#what-is-compute-compute-separation) with different CPU/Memory ratio. +The three approaches described above can also be further isolated by using [Warehouses](/cloud/reference/warehouses#what-is-a-warehouse). Data is shared through a common object storage but each tenant can have its own compute service thanks to [compute-compute separation](/cloud/reference/warehouses#what-is-compute-compute-separation) with different CPU/Memory ratio. -User management is similar to the approaches described previously, since all services in a warehouse [share access controls](/cloud/reference/warehouses#database-credentials). +User management is similar to the approaches described previously, since all services in a warehouse [share access controls](/cloud/reference/warehouses#database-credentials). Note the number of child services in a warehouse is limited to a small number. See [Warehouse limitations](/cloud/reference/warehouses#limitations). ## Separate cloud service {#separate-service} -The most radical approach is to use a different ClickHouse service per tenant. +The most radical approach is to use a different ClickHouse service per tenant. > **This less common method would be a solution if tenants data are required to be stored in different regions - for legal, security or proximity reasons.** @@ -317,7 +317,7 @@ This approach is harder to manage and bring overhead with each service, as they ### Example {#separate-service-example} -This is an example of a separate service multi-tenancy model implementation. Note the example shows the creation of tables and users on one ClickHouse service, the same will have to be replicated on all services. +This is an example of a separate service multi-tenancy model implementation. Note the example shows the creation of tables and users on one ClickHouse service, the same will have to be replicated on all services. First, let's create the table `events` @@ -349,7 +349,7 @@ VALUES Then let's create two users `user_1` ```sql --- Create users +-- Create users CREATE USER user_1 IDENTIFIED BY '' ``` @@ -360,7 +360,7 @@ Then `GRANT SELECT` privileges on the corresponding table. GRANT SELECT ON events TO user_1 ``` -Now you can connect as `user_1` on the service for tenant 1 and run a simple select. Only rows from the first tenant are returned. +Now you can connect as `user_1` on the service for tenant 1 and run a simple select. Only rows from the first tenant are returned. ```sql -- Logged as user_1 @@ -373,6 +373,5 @@ FROM events 3. │ 6b4d12e4-447d-4398-b3fa-1c1e94d71a2f │ user_logout │ 2025-03-19 08:10:00 │ 1001 │ {"device": "desktop", "location": "LA"} │ 4. 
│ 83b5eb72-aba3-4038-bc52-6c08b6423615 │ purchase │ 2025-03-19 08:45:00 │ 1003 │ {"item": "monitor", "amount": 450} │ 5. │ 975fb0c8-55bd-4df4-843b-34f5cfeed0a9 │ user_login │ 2025-03-19 08:50:00 │ 1004 │ {"device": "desktop", "location": "LA"} │ - └──────────────────────────────────────┴─────────────┴─────────────────────┴─────────┴─────────────────────────────────────────┘ -``` - + └──────────────────────────────────────┴─────────────┴─────────────────────┴─────────┴─────────────────────────────────────────┘ + ``` diff --git a/docs/cloud/bestpractices/usagelimits.md b/docs/cloud/bestpractices/usagelimits.md index ca9dea88376..4fd97429562 100644 --- a/docs/cloud/bestpractices/usagelimits.md +++ b/docs/cloud/bestpractices/usagelimits.md @@ -8,7 +8,7 @@ description: 'Describes the recommended usage limits in ClickHouse Cloud' While ClickHouse is known for its speed and reliability, optimal performance is achieved within certain operating parameters. For example, having too many tables, databases or parts could negatively impact performance. To avoid this, Clickhouse Cloud has guardrails set up for several types of items. You can find details of these guardrails below. :::tip -If you've run up against one of these guardrails, it's possible that you are implementing your use case in an unoptimized way. Contact our support team and we will gladly help you refine your use case to avoid exceeding the guardrails or look together at how we can increase them in a controlled manner. +If you've run up against one of these guardrails, it's possible that you are implementing your use case in an unoptimized way. Contact our support team and we will gladly help you refine your use case to avoid exceeding the guardrails or look together at how we can increase them in a controlled manner. ::: | Dimension | Limit | @@ -29,5 +29,3 @@ If you've run up against one of these guardrails, it's possible that you are imp :::note For Single Replica Services, the maximum number of databases is restricted to 100, and the maximum number of tables is restricted to 500. In addition, storage for Basic Tier Services is limited to 1 TB. ::: - - diff --git a/docs/cloud/changelogs/24_02.md b/docs/cloud/changelogs/24_02.md index fa418da75a8..e406d700f9b 100644 --- a/docs/cloud/changelogs/24_02.md +++ b/docs/cloud/changelogs/24_02.md @@ -19,7 +19,7 @@ sidebar_position: 8 * The obsolete in-memory data parts have been deprecated since version 23.5 and have not been supported since version 23.10. Now the remaining code is removed. Continuation of [#55186](https://github.com/ClickHouse/ClickHouse/issues/55186) and [#45409](https://github.com/ClickHouse/ClickHouse/issues/45409). It is unlikely that you have used in-memory data parts because they were available only before version 23.5 and only when you enabled them manually by specifying the corresponding SETTINGS for a MergeTree table. To check if you have in-memory data parts, run the following query: `SELECT part_type, count() FROM system.parts GROUP BY part_type ORDER BY part_type`. To disable the usage of in-memory data parts, do `ALTER TABLE ... MODIFY SETTING min_bytes_for_compact_part = DEFAULT, min_rows_for_compact_part = DEFAULT`. Before upgrading from old ClickHouse releases, first check that you don't have in-memory data parts. If there are in-memory data parts, disable them first, then wait while there are no in-memory data parts and continue the upgrade. 
[#61127](https://github.com/ClickHouse/ClickHouse/pull/61127) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Forbid `SimpleAggregateFunction` in `ORDER BY` of `MergeTree` tables (like `AggregateFunction` is forbidden, but they are forbidden because they are not comparable) by default (use `allow_suspicious_primary_key` to allow them). [#61399](https://github.com/ClickHouse/ClickHouse/pull/61399) ([Azat Khuzhin](https://github.com/azat)). * ClickHouse allows arbitrary binary data in the String data type, which is typically UTF-8. Parquet/ORC/Arrow Strings only support UTF-8. That's why you can choose which Arrow's data type to use for the ClickHouse String data type - String or Binary. This is controlled by the settings, `output_format_parquet_string_as_string`, `output_format_orc_string_as_string`, `output_format_arrow_string_as_string`. While Binary would be more correct and compatible, using String by default will correspond to user expectations in most cases. Parquet/ORC/Arrow supports many compression methods, including lz4 and zstd. ClickHouse supports each and every compression method. Some inferior tools lack support for the faster `lz4` compression method, that's why we set `zstd` by default. This is controlled by the settings `output_format_parquet_compression_method`, `output_format_orc_compression_method`, and `output_format_arrow_compression_method`. We changed the default to `zstd` for Parquet and ORC, but not Arrow (it is emphasized for low-level usages). [#61817](https://github.com/ClickHouse/ClickHouse/pull/61817) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix for the materialized view security issue, which allowed a user to insert into a table without required grants for that. Fix validates that the user has permission to insert not only into a materialized view but also into all underlying tables. This means that some queries, which worked before, now can fail with Not enough privileges. To address this problem, the release introduces a new feature of SQL security for views [https://clickhouse.com/docs/sql-reference/statements/create/view#sql_security](/sql-reference/statements/create/view#sql_security). [#54901](https://github.com/ClickHouse/ClickHouse/pull/54901) ([pufit](https://github.com/pufit)) +* Fix for the materialized view security issue, which allowed a user to insert into a table without required grants for that. Fix validates that the user has permission to insert not only into a materialized view but also into all underlying tables. This means that some queries, which worked before, now can fail with Not enough privileges. To address this problem, the release introduces a new feature of SQL security for views [https://clickhouse.com/docs/sql-reference/statements/create/view#sql_security](/sql-reference/statements/create/view#sql_security). [#54901](https://github.com/ClickHouse/ClickHouse/pull/54901) ([pufit](https://github.com/pufit)) #### New feature {#new-feature} * Topk/topkweighed support mode, which return count of values and it's error. [#54508](https://github.com/ClickHouse/ClickHouse/pull/54508) ([UnamedRus](https://github.com/UnamedRus)). diff --git a/docs/cloud/changelogs/24_05.md b/docs/cloud/changelogs/24_05.md index 62e97dd6e9c..b6f90bf2eea 100644 --- a/docs/cloud/changelogs/24_05.md +++ b/docs/cloud/changelogs/24_05.md @@ -21,7 +21,6 @@ Relevant changes for ClickHouse Cloud services based on the v24.5 release. 
* Usage of functions neighbor, runningAccumulate, runningDifferenceStartingWithFirstValue, runningDifference deprecated (because it is error-prone). Proper window functions should be used instead. To enable them back, set allow_deprecated_error_prone_window_functions=1. [#63132](https://github.com/ClickHouse/ClickHouse/pull/63132) (Nikita Taranov). - ## Backward incompatible changes {#backward-incompatible-changes} * In the new ClickHouse version, the functions geoDistance, greatCircleDistance, and greatCircleAngle will use 64-bit double precision floating point data type for internal calculations and return type if all the arguments are Float64. This closes #58476. In previous versions, the function always used Float32. You can switch to the old behavior by setting geo_distance_returns_float64_on_float64_arguments to false or setting compatibility to 24.2 or earlier. [#61848](https://github.com/ClickHouse/ClickHouse/pull/61848) (Alexey Milovidov). diff --git a/docs/cloud/changelogs/24_08.md b/docs/cloud/changelogs/24_08.md index 78acd3f5f98..09a7aa86779 100644 --- a/docs/cloud/changelogs/24_08.md +++ b/docs/cloud/changelogs/24_08.md @@ -29,7 +29,6 @@ Relevant changes for ClickHouse Cloud services based on the v24.8 release. - Fix REPLACE modifier formatting (forbid omitting brackets). [#67774](https://github.com/ClickHouse/ClickHouse/pull/67774) (Azat Khuzhin). - ## New feature {#new-feature} - Extend function tuple to construct named tuples in query. Introduce function tupleNames to extract names from tuples. [#54881](https://github.com/ClickHouse/ClickHouse/pull/54881) (Amos Bird). diff --git a/docs/cloud/changelogs/24_10.md b/docs/cloud/changelogs/24_10.md index c62c6c3fc68..419c488272b 100644 --- a/docs/cloud/changelogs/24_10.md +++ b/docs/cloud/changelogs/24_10.md @@ -17,7 +17,6 @@ Relevant changes for ClickHouse Cloud services based on the v24.10 release. - Reordering of filter conditions from `[PRE]WHERE` clause is now allowed by default. It could be disabled by setting `allow_reorder_prewhere_conditions` to `false`. [#70657](https://github.com/ClickHouse/ClickHouse/pull/70657) ([Nikita Taranov](https://github.com/nickitat)). - Fix `optimize_functions_to_subcolumns` optimization (previously could lead to `Invalid column type for ColumnUnique::insertRangeFrom. Expected String, got LowCardinality(String)` error), by preserving `LowCardinality` type in `mapKeys`/`mapValues`. [#70716](https://github.com/ClickHouse/ClickHouse/pull/70716) ([Azat Khuzhin](https://github.com/azat)). - ## New feature {#new-feature} - Refreshable materialized views are production ready. [#70550](https://github.com/ClickHouse/ClickHouse/pull/70550) ([Michael Kolupaev](https://github.com/al13n321)). Refreshable materialized views are now supported in Replicated databases. [#60669](https://github.com/ClickHouse/ClickHouse/pull/60669) ([Michael Kolupaev](https://github.com/al13n321)). - Function `toStartOfInterval()` now has a new overload which emulates TimescaleDB's `time_bucket()` function, respectively PostgreSQL's `date_bin()` function. ([#55619](https://github.com/ClickHouse/ClickHouse/issues/55619)). It allows to align date or timestamp values to multiples of a given interval from an *arbitrary* origin (instead of 0000-01-01 00:00:00.000 as *fixed* origin). For example, `SELECT toStartOfInterval(toDateTime('2023-01-01 14:45:00'), INTERVAL 1 MINUTE, toDateTime('2023-01-01 14:35:30'));` returns `2023-01-01 14:44:30` which is a multiple of 1 minute intervals, starting from origin `2023-01-01 14:35:30`. 
[#56738](https://github.com/ClickHouse/ClickHouse/pull/56738) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). @@ -51,5 +50,3 @@ Relevant changes for ClickHouse Cloud services based on the v24.10 release. - Allow to read/write JSON type as binary string in RowBinary format under settings `input_format_binary_read_json_as_string/output_format_binary_write_json_as_string`. [#70288](https://github.com/ClickHouse/ClickHouse/pull/70288) ([Pavel Kruglov](https://github.com/Avogar)). - Allow to serialize/deserialize JSON column as single String column in Native format. For output use setting `output_format_native_write_json_as_string`. For input, use serialization version `1` before the column data. [#70312](https://github.com/ClickHouse/ClickHouse/pull/70312) ([Pavel Kruglov](https://github.com/Avogar)). - Supports standard CTE, `with insert`, as previously only supports `insert ... with ...`. [#70593](https://github.com/ClickHouse/ClickHouse/pull/70593) ([Shichao Jin](https://github.com/jsc0218)). - - diff --git a/docs/cloud/get-started/query-endpoints.md b/docs/cloud/get-started/query-endpoints.md index 07c332e5fdc..07ccdeda2cf 100644 --- a/docs/cloud/get-started/query-endpoints.md +++ b/docs/cloud/get-started/query-endpoints.md @@ -116,14 +116,14 @@ None - `queryVariables` (optional): An object containing variables to be used in the query. - `format` (optional): The format of the response. If Query API Endpoint is version 2 any ClickHouse supported format is possible. Supported formats for v1 are: - - TabSeparated - - TabSeparatedWithNames - - TabSeparatedWithNamesAndTypes - - JSON - - JSONEachRow - - CSV - - CSVWithNames - - CSVWithNamesAndTypes + - TabSeparated + - TabSeparatedWithNames + - TabSeparatedWithNamesAndTypes + - JSON + - JSONEachRow + - CSV + - CSVWithNames + - CSVWithNamesAndTypes ### Responses {#responses} diff --git a/docs/cloud/get-started/sql-console.md b/docs/cloud/get-started/sql-console.md index 41f1e3bd94c..fd114373ac0 100644 --- a/docs/cloud/get-started/sql-console.md +++ b/docs/cloud/get-started/sql-console.md @@ -82,7 +82,7 @@ To sort a table in the SQL console, open a table and select the 'Sort' button in sort descending on a column -The SQL console also allows you to add multiple sorts to a table. Click the 'Sort' button again to add another sort. +The SQL console also allows you to add multiple sorts to a table. Click the 'Sort' button again to add another sort. :::note Sorts are applied in the order that they appear in the sort pane (top to bottom). To remove a sort, simply click the 'x' button next to the sort. @@ -127,7 +127,7 @@ There are two ways to create a new query in the SQL console. - Click the '+' button in the tab bar - Select the 'New Query' button from the left sidebar query list -Creating a query + Creating a query ### Running a query {#running-a-query} @@ -139,23 +139,23 @@ By default, clicking the run button will run all commands contained in the SQL E - Run selected command(s) - Run command at the cursor -To run selected command(s), highlight the desired command or sequence of commands and click the 'Run' button (or use the `cmd / ctrl + enter` shortcut). You can also select 'Run selected' from the SQL Editor context menu (opened by right-clicking anywhere within the editor) when a selection is present. + To run selected command(s), highlight the desired command or sequence of commands and click the 'Run' button (or use the `cmd / ctrl + enter` shortcut). 
You can also select 'Run selected' from the SQL Editor context menu (opened by right-clicking anywhere within the editor) when a selection is present. -run selected query + run selected query -Running the command at the current cursor position can be achieved in two ways: + Running the command at the current cursor position can be achieved in two ways: - Select 'At Cursor' from the extended run options menu (or use the corresponding `cmd / ctrl + shift + enter` keyboard shortcut -run at cursor + run at cursor - - Selecting 'Run at cursor' from the SQL Editor context menu + - Selecting 'Run at cursor' from the SQL Editor context menu -run at cursor + run at cursor -:::note -The command present at the cursor position will flash yellow on execution. -::: + :::note + The command present at the cursor position will flash yellow on execution. + ::: ### Canceling a query {#canceling-a-query} @@ -188,23 +188,23 @@ The SQL console allows you to easily share queries with your team members. The S - Read-only access - No access -After saving a query, click the "Share" button in the toolbar. A modal with sharing options will appear: + After saving a query, click the "Share" button in the toolbar. A modal with sharing options will appear: -Share query + Share query -To adjust query access for all organization members with access to the service, simply adjust the access level selector in the top line: + To adjust query access for all organization members with access to the service, simply adjust the access level selector in the top line: -Edit access + Edit access -After applying the above, the query can now be viewed (and executed) by all team members with access to the SQL console for the service. + After applying the above, the query can now be viewed (and executed) by all team members with access to the SQL console for the service. -To adjust query access for specific members, select the desired team member from the "Add a team member" selector: + To adjust query access for specific members, select the desired team member from the "Add a team member" selector: -Add team member + Add team member -After selecting a team member, a new line item should appear with an access level selector: + After selecting a team member, a new line item should appear with an access level selector: -Edit team member access + Edit team member access ### Accessing shared queries {#accessing-shared-queries} @@ -298,10 +298,10 @@ A number of more advanced chart characteristics can also be adjusted in the 'Adv - Axis titles - Label orientation for the x-axis -Our chart will be updated accordingly: + Our chart will be updated accordingly: -Update subtitle etc. + Update subtitle etc. -In some scenarios, it may be necessary to adjust the axis scales for each field independently. This can also be accomplished in the 'Advanced' section of the chart configuration pane by specifying min and max values for an axis range. As an example, the above chart looks good, but in order to demonstrate the correlation between our `trip_total` and `fare_total` fields, the axis ranges need some adjustment: + In some scenarios, it may be necessary to adjust the axis scales for each field independently. This can also be accomplished in the 'Advanced' section of the chart configuration pane by specifying min and max values for an axis range. 
As an example, the above chart looks good, but in order to demonstrate the correlation between our `trip_total` and `fare_total` fields, the axis ranges need some adjustment: -Adjust axis scale + Adjust axis scale diff --git a/docs/cloud/manage/account-close.md b/docs/cloud/manage/account-close.md index 6aed00bf62c..d6d02746604 100644 --- a/docs/cloud/manage/account-close.md +++ b/docs/cloud/manage/account-close.md @@ -41,16 +41,15 @@ below. 4. Under 'Support' click 'Create case.' 5. In the 'Create new case' screen, enter the following: -```text -Priority: Severity 3 -Subject: Please close my ClickHouse account -Description: We would appreciate it if you would share a brief note about why you are cancelling. -``` + ```text + Priority: Severity 3 + Subject: Please close my ClickHouse account + Description: We would appreciate it if you would share a brief note about why you are cancelling. + ``` 5. Click 'Create new case' 6. We will close your account and send a confirmation email to let you know when it is complete. - ## Request deletion of your personal data {#request-personal-data-deletion} Please note, only account administrators may request personal data deletion from ClickHouse. If you are not an account administrator, please contact your ClickHouse account administrator to request to be removed from the account. diff --git a/docs/cloud/manage/api/api-overview.md b/docs/cloud/manage/api/api-overview.md index 0d006650519..4b436479e41 100644 --- a/docs/cloud/manage/api/api-overview.md +++ b/docs/cloud/manage/api/api-overview.md @@ -10,9 +10,9 @@ description: 'Learn about ClickHouse Cloud API' ## Overview {#overview} -The ClickHouse Cloud API is a REST API designed for developers to easily manage -organizations and services on ClickHouse Cloud. Using our Cloud API, you can -create and manage services, provision API keys, add or remove members in your +The ClickHouse Cloud API is a REST API designed for developers to easily manage +organizations and services on ClickHouse Cloud. Using our Cloud API, you can +create and manage services, provision API keys, add or remove members in your organization, and more. [Learn how to create your first API key and start using the ClickHouse Cloud API.](/cloud/manage/openapi.md) @@ -27,9 +27,9 @@ the [Swagger UI](https://clickhouse.com/docs/cloud/manage/api/swagger). ## Rate limits {#rate-limits} -Developers are limited to 100 API keys per organization. Each API key has a -limit of 10 requests over a 10-second window. If you'd like to increase the -number of API keys or requests per 10-second window for your organization, +Developers are limited to 100 API keys per organization. Each API key has a +limit of 10 requests over a 10-second window. If you'd like to increase the +number of API keys or requests per 10-second window for your organization, please contact support@clickhouse.com ## Terraform provider {#terraform-provider} @@ -40,11 +40,11 @@ less error-prone. You can view the Terraform provider docs in the [Terraform registry](https://registry.terraform.io/providers/ClickHouse/clickhouse/latest/docs). -If you'd like to contribute to the ClickHouse Terraform Provider, you can view +If you'd like to contribute to the ClickHouse Terraform Provider, you can view the source [in the GitHub repo](https://github.com/ClickHouse/terraform-provider-clickhouse). ## Support {#support} -We recommend visiting [our Slack channel](https://clickhouse.com/slack) first to get quick support. 
If -you'd like additional help or more info about our API and its capabilities, +We recommend visiting [our Slack channel](https://clickhouse.com/slack) first to get quick support. If +you'd like additional help or more info about our API and its capabilities, please contact ClickHouse Support at https://console.clickhouse.cloud/support diff --git a/docs/cloud/manage/api/index.md b/docs/cloud/manage/api/index.md index da8d8e74c0a..34b5958fda7 100644 --- a/docs/cloud/manage/api/index.md +++ b/docs/cloud/manage/api/index.md @@ -8,6 +8,6 @@ This section contains reference documentation for Cloud API and contains the fol | Page | Description | |--------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------| -| [Overview](/cloud/manage/api/api-overview) | Provides an overview of rate limits, Terraform Provider, Swagger (OpenAPI) Endpoint and UI and available support. | +| [Overview](/cloud/manage/api/api-overview) | Provides an overview of rate limits, Terraform Provider, Swagger (OpenAPI) Endpoint and UI and available support. | | [Managing API Keys](/cloud/manage/openapi) | Learn more about Cloud's API utilizing OpenAPI that allows you to programmatically manage your account and aspects of your services. | -| [API Reference](https://clickhouse.com/docs/cloud/manage/api/swagger) | OpenAPI (swagger) reference page. | \ No newline at end of file +| [API Reference](https://clickhouse.com/docs/cloud/manage/api/swagger) | OpenAPI (swagger) reference page. | diff --git a/docs/cloud/manage/backups/configurable-backups.md b/docs/cloud/manage/backups/configurable-backups.md index d4e71ea205a..c4b7dcc7d88 100644 --- a/docs/cloud/manage/backups/configurable-backups.md +++ b/docs/cloud/manage/backups/configurable-backups.md @@ -20,22 +20,22 @@ ClickHouse Cloud allows you to configure the schedule for your backups for **Sca - **Frequency**: The frequency allows you to specify the time duration between subsequent backups. For instance, a frequency of "every 12 hours" means that backups will be spaced 12 hours apart. Frequency can range from "every 6 hours" to "every 48 hours" in the following hourly increments: `6`, `8`, `12`, `16`, `20`, `24`, `36`, `48`. - **Start Time**: The start time for when you want to schedule backups each day. Specifying a start time implies that the backup "Frequency" will default to once every 24 hours. Clickhouse Cloud will start the backup within an hour of the specified start time. -:::note -The custom schedule will override the default backup policy in ClickHouse Cloud for your given service. -::: + :::note + The custom schedule will override the default backup policy in ClickHouse Cloud for your given service. + ::: -To configure the backup schedule for a service, go to the **Settings** tab in the console and click on **Change backup configuration**. + To configure the backup schedule for a service, go to the **Settings** tab in the console and click on **Change backup configuration**. -Configure backup settings + Configure backup settings -This opens a tab to the right where you can choose values for retention, frequency, and start time. You will need to save the chosen settings for them to take effect. + This opens a tab to the right where you can choose values for retention, frequency, and start time. You will need to save the chosen settings for them to take effect. 
-Select backup retention and frequency + Select backup retention and frequency -:::note -Start time and frequency are mutually exclusive. Start time takes precedence. -::: + :::note + Start time and frequency are mutually exclusive. Start time takes precedence. + ::: -:::note -Changing the backup schedule can cause higher monthly charges for storage as some of the backups might not be covered in the default backups for the service. See ["Understanding backup cost"](./overview.md/#understanding-backup-cost) section below. -::: + :::note + Changing the backup schedule can cause higher monthly charges for storage as some of the backups might not be covered in the default backups for the service. See ["Understanding backup cost"](./overview.md/#understanding-backup-cost) section below. + ::: diff --git a/docs/cloud/manage/backups/export-backups-to-own-cloud-account.md b/docs/cloud/manage/backups/export-backups-to-own-cloud-account.md index d8e9e34f7cd..6333a362e21 100644 --- a/docs/cloud/manage/backups/export-backups-to-own-cloud-account.md +++ b/docs/cloud/manage/backups/export-backups-to-own-cloud-account.md @@ -30,14 +30,13 @@ You will need the following details to export/restore backups to your own CSP st s3://.s3.amazonaws.com/ ``` - For example: + For example: ```text s3://testchbackups.s3.amazonaws.com/backups/ - ``` + ``` Where: - - `testchbackups` is the name of the S3 bucket to export backups to. - - `backups` is an optional subdirectory. - + - `testchbackups` is the name of the S3 bucket to export backups to. + - `backups` is an optional subdirectory. 2. AWS access key and secret. @@ -56,7 +55,7 @@ You will need the following details to export/restore backups to your own CSP st ``` 2. Access HMAC key and HMAC secret. -


+
# Backup / Restore ## Backup / Restore to AWS S3 Bucket {#backup--restore-to-aws-s3-bucket} @@ -66,7 +65,7 @@ You will need the following details to export/restore backups to your own CSP st **Full Backup** ```sql -BACKUP DATABASE test_backups +BACKUP DATABASE test_backups TO S3('https://testchbackups.s3.amazonaws.com/backups/', '', '') ``` @@ -74,22 +73,22 @@ Where `uuid` is a unique identifier, used to differentiate a set of backups. :::note You will need to use a different UUID for each new backup in this subdirectory, otherwise you will get a `BACKUP_ALREADY_EXISTS` error. -For example, if you are taking daily backups, you will need to use a new UUID each day. +For example, if you are taking daily backups, you will need to use a new UUID each day. ::: **Incremental Backup** ```sql -BACKUP DATABASE test_backups -TO S3('https://testchbackups.s3.amazonaws.com/backups/', '', '') +BACKUP DATABASE test_backups +TO S3('https://testchbackups.s3.amazonaws.com/backups/', '', '') SETTINGS base_backup = S3('https://testchbackups.s3.amazonaws.com/backups/', '', '') ``` ### Restore from a backup {#restore-from-a-backup} ```sql -RESTORE DATABASE test_backups -AS test_backups_restored +RESTORE DATABASE test_backups +AS test_backups_restored FROM S3('https://testchbackups.s3.amazonaws.com/backups/', '', '') ``` @@ -102,7 +101,7 @@ See: [Configuring BACKUP/RESTORE to use an S3 Endpoint](/operations/backup#confi **Full Backup** ```sql -BACKUP DATABASE test_backups +BACKUP DATABASE test_backups TO AzureBlobStorage('', '', '/'); ``` @@ -111,16 +110,16 @@ Where `uuid` is a unique identifier, used to differentiate a set of backups. **Incremental Backup** ```sql -BACKUP DATABASE test_backups -TO AzureBlobStorage('', '', '//my_incremental') +BACKUP DATABASE test_backups +TO AzureBlobStorage('', '', '//my_incremental') SETTINGS base_backup = AzureBlobStorage('', '', '/') ``` ### Restore from a backup {#restore-from-a-backup-1} ```sql -RESTORE DATABASE test_backups -AS test_backups_restored_azure +RESTORE DATABASE test_backups +AS test_backups_restored_azure FROM AzureBlobStorage('', '', '/') ``` @@ -133,7 +132,7 @@ See: [Configuring BACKUP/RESTORE to use an S3 Endpoint](/operations/backup#confi **Full Backup** ```sql -BACKUP DATABASE test_backups +BACKUP DATABASE test_backups TO S3('https://storage.googleapis.com//', ', ) ``` Where `uuid` is a unique identifier, used to differentiate a set of backups. @@ -141,7 +140,7 @@ Where `uuid` is a unique identifier, used to differentiate a set of backups. **Incremental Backup** ```sql -BACKUP DATABASE test_backups +BACKUP DATABASE test_backups TO S3('https://storage.googleapis.com/test_gcs_backups//my_incremental', 'key', 'secret') SETTINGS base_backup = S3('https://storage.googleapis.com/test_gcs_backups/', 'key', 'secret') ``` @@ -149,7 +148,7 @@ SETTINGS base_backup = S3('https://storage.googleapis.com/test_gcs_backups/', 'key', 'secret') ``` diff --git a/docs/cloud/manage/backups/overview.md b/docs/cloud/manage/backups/overview.md index 1e8f85b0c22..2cf849be2a7 100644 --- a/docs/cloud/manage/backups/overview.md +++ b/docs/cloud/manage/backups/overview.md @@ -50,17 +50,15 @@ To understand the backup cost, you can view the backup cost per service from the Backup usage chart in ClickHouse Cloud - Estimating the total cost for your backups requires you to set a schedule. We are also working on updating our [pricing calculator](https://clickhouse.com/pricing), so you can get a monthly cost estimate before setting a schedule. 
You will need to provide the following inputs in order to estimate the cost: - Size of the full and incremental backups - Desired frequency - Desired retention - Cloud provider and region -:::note -Keep in mind that the estimated cost for backups will change as the size of the data in the service grows over time. -::: - + :::note + Keep in mind that the estimated cost for backups will change as the size of the data in the service grows over time. + ::: ## Restore a backup {#restore-a-backup} diff --git a/docs/cloud/manage/billing.md b/docs/cloud/manage/billing.md index 1def3aac523..bb4aca49c83 100644 --- a/docs/cloud/manage/billing.md +++ b/docs/cloud/manage/billing.md @@ -8,7 +8,7 @@ description: 'Overview page for ClickHouse Cloud pricing' import ClickPipesFAQ from './jan2025_faq/_snippets/_clickpipes_faq.md' For pricing information, see the [ClickHouse Cloud Pricing](https://clickhouse.com/pricing#pricing-calculator) page. -ClickHouse Cloud bills based on the usage of compute, storage, [data transfer](/cloud/manage/network-data-transfer) (egress over the internet and cross-region), and [ClickPipes](/integrations/clickpipes). +ClickHouse Cloud bills based on the usage of compute, storage, [data transfer](/cloud/manage/network-data-transfer) (egress over the internet and cross-region), and [ClickPipes](/integrations/clickpipes). To understand what can affect your bill, and ways that you can manage your spend, keep reading. ## Amazon Web Services (AWS) example {#amazon-web-services-aws-example} @@ -16,7 +16,7 @@ To understand what can affect your bill, and ways that you can manage your spend :::note - Prices reflect AWS us-east-1 pricing. - Explore applicable data transfer and ClickPipes charges [here](jan2025_faq/dimensions.md). -::: + ::: ### Basic: from $66.52 per month {#basic-from-6652-per-month} @@ -29,48 +29,48 @@ Best for: Departmental use cases with smaller data volumes that do not have hard - 10 GB of public internet egress data transfer - 5 GB of cross-region data transfer -Pricing breakdown for this example: + Pricing breakdown for this example: - - +
|                                      | Active 6 hours a day | Active 12 hours a day | Active 24 hours a day |
|--------------------------------------|----------------------|-----------------------|-----------------------|
| Compute                              | \$39.91              | \$79.83               | \$159.66              |
| Storage                              | \$25.30              | \$25.30               | \$25.30               |
| Public internet egress data transfer | \$1.15               | \$1.15                | \$1.15                |
| Cross-region data transfer           | \$0.16               | \$0.16                | \$0.16                |
| Total                                | \$66.52              | \$106.44              | \$186.27              |
+ + + ### Scale (always-on, auto-scaling): from $499.38 per month {#scale-always-on-auto-scaling-from-49938-per-month} @@ -82,48 +82,48 @@ Best for: workloads requiring enhanced SLAs (2+ replica services), scalability, - 100 GB of public internet egress data transfer - 10 GB of cross-region data transfer -Pricing breakdown for this example: + Pricing breakdown for this example: - - +
|                                      | Example 1 | Example 2 | Example 3 |
|--------------------------------------|-----------|-----------|-----------|
| Compute                              | 2 replicas x 8 GiB RAM, 2 vCPU<br/>\$436.95 | 2 replicas x 16 GiB RAM, 4 vCPU<br/>\$873.89 | 3 replicas x 16 GiB RAM, 4 vCPU<br/>\$1,310.84 |
| Storage                              | 1 TB of data + 1 backup<br/>\$50.60 | 2 TB of data + 1 backup<br/>\$101.20 | 3 TB of data + 1 backup<br/>\$151.80 |
| Public internet egress data transfer | \$11.52 | \$11.52 | \$11.52 |
| Cross-region data transfer           | \$0.31 | \$0.31 | \$0.31 |
| Total                                | \$499.38 | \$986.92 | \$1,474.47 |
+ + + ### Enterprise: Starting prices vary {#enterprise-starting-prices-vary} @@ -134,85 +134,85 @@ Best for: large scale, mission critical deployments that have stringent security - 1 TB of public internet egress data transfer - 500 GB of cross-region data transfer - - +
|                                      | Example 1 | Example 2 | Example 3 |
|--------------------------------------|-----------|-----------|-----------|
| Compute                              | 2 replicas x 32 GiB RAM, 8 vCPU<br/>\$2,285.60 | 2 replicas x 64 GiB RAM, 16 vCPU<br/>\$4,571.19 | 2 x 120 GiB RAM, 30 vCPU<br/>\$8,570.99 |
| Storage                              | 5 TB + 1 backup<br/>\$253.00 | 10 TB + 1 backup<br/>\$506.00 | 20 TB + 1 backup<br/>\$1,012.00 |
| Public internet egress data transfer | \$115.20 | \$115.20 | \$115.20 |
| Cross-region data transfer           | \$15.60 | \$15.60 | \$15.60 |
| Total                                | \$2,669.40 | \$5,207.99 | \$9,713.79 |
+ + + ## Frequently asked questions {#faqs} ### How is compute metered? {#how-is-compute-metered} -ClickHouse Cloud meters compute on a per-minute basis, in 8G RAM increments. +ClickHouse Cloud meters compute on a per-minute basis, in 8G RAM increments. Compute costs will vary by tier, region, and cloud service provider. ### How is storage on disk calculated? {#how-is-storage-on-disk-calculated} -ClickHouse Cloud uses cloud object storage and usage is metered on the compressed size of data stored in ClickHouse tables. -Storage costs are the same across tiers and vary by region and cloud service provider. +ClickHouse Cloud uses cloud object storage and usage is metered on the compressed size of data stored in ClickHouse tables. +Storage costs are the same across tiers and vary by region and cloud service provider. ### Do backups count toward total storage? {#do-backups-count-toward-total-storage} -Storage and backups are counted towards storage costs and billed separately. -All services will default to one backup, retained for a day. +Storage and backups are counted towards storage costs and billed separately. +All services will default to one backup, retained for a day. Users who need additional backups can do so by configuring additional [backups](backups/overview.md) under the settings tab of the Cloud console. ### How do I estimate compression? {#how-do-i-estimate-compression} -Compression can vary from dataset to dataset. -How much it varies is dependent on how compressible the data is in the first place (number of high vs. low cardinality fields), -and how the user sets up the schema (using optional codecs or not, for instance). -It can be on the order of 10x for common types of analytical data, but it can be significantly lower or higher as well. -See the [optimizing documentation](/optimize/asynchronous-inserts) for guidance and this [Uber blog](https://www.uber.com/blog/logging/) for a detailed logging use case example. +Compression can vary from dataset to dataset. +How much it varies is dependent on how compressible the data is in the first place (number of high vs. low cardinality fields), +and how the user sets up the schema (using optional codecs or not, for instance). +It can be on the order of 10x for common types of analytical data, but it can be significantly lower or higher as well. +See the [optimizing documentation](/optimize/asynchronous-inserts) for guidance and this [Uber blog](https://www.uber.com/blog/logging/) for a detailed logging use case example. The only practical way to know exactly is to ingest your dataset into ClickHouse and compare the size of the dataset with the size stored in ClickHouse. You can use the query: ```sql title="Estimating compression" -SELECT formatReadableSize(total_bytes) -FROM system.tables +SELECT formatReadableSize(total_bytes) +FROM system.tables WHERE name = ``` ### What tools does ClickHouse offer to estimate the cost of running a service in the cloud if I have a self-managed deployment? {#what-tools-does-clickhouse-offer-to-estimate-the-cost-of-running-a-service-in-the-cloud-if-i-have-a-self-managed-deployment} -The ClickHouse query log captures [key metrics](/operations/system-tables/query_log) that can be used to estimate the cost of running a workload in ClickHouse Cloud. +The ClickHouse query log captures [key metrics](/operations/system-tables/query_log) that can be used to estimate the cost of running a workload in ClickHouse Cloud. 
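As a rough illustration (a sketch, not an official sizing formula), you could summarize a recent window of workload from `system.query_log` along these lines:

```sql
-- Sketch: summarize one week of finished queries from the query log.
-- Adjust the time window and aggregates to the workload you want to size.
SELECT
    count() AS queries,
    formatReadableSize(sum(read_bytes)) AS total_data_read,
    formatReadableSize(max(memory_usage)) AS peak_query_memory,
    round(avg(query_duration_ms)) AS avg_duration_ms
FROM system.query_log
WHERE type = 'QueryFinish'
  AND event_time > now() - INTERVAL 7 DAY
```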
For details on migrating from self-managed to ClickHouse Cloud please refer to the [migration documentation](/cloud/migration/clickhouse-to-cloud), and contact [ClickHouse Cloud support](https://console.clickhouse.cloud/support) if you have further questions. ### What billing options are available for ClickHouse Cloud? {#what-billing-options-are-available-for-clickhouse-cloud} @@ -258,101 +258,101 @@ All marketplace subscriptions are billed and invoiced by the marketplace. You ca ### Why do the dates on the Usage statements not match my Marketplace Invoice? {#why-do-the-dates-on-the-usage-statements-not-match-my-marketplace-invoice} AWS Marketplace billing follows the calendar month cycle. -For example, for usage between dates 01-Dec-2024 and 01-Jan-2025, +For example, for usage between dates 01-Dec-2024 and 01-Jan-2025, an invoice is generated between 3-Jan and 5-Jan-2025 -ClickHouse Cloud usage statements follow a different billing cycle where usage is metered +ClickHouse Cloud usage statements follow a different billing cycle where usage is metered and reported over 30 days starting from the day of sign up. The usage and invoice dates will differ if these dates are not the same. Since usage statements track usage by day for a given service, users can rely on statements to see the breakdown of costs. ### Are there any restrictions around the usage of prepaid credits? {#are-there-any-restrictions-around-the-usage-of-prepaid-credits} -ClickHouse Cloud prepaid credits (whether direct through ClickHouse, or via a cloud provider's marketplace) -can only be leveraged for the terms of the contract. -This means they can be applied on the acceptance date, or a future date, and not for any prior periods. +ClickHouse Cloud prepaid credits (whether direct through ClickHouse, or via a cloud provider's marketplace) +can only be leveraged for the terms of the contract. +This means they can be applied on the acceptance date, or a future date, and not for any prior periods. Any overages not covered by prepaid credits must be covered by a credit card payment or marketplace monthly billing. ### Is there a difference in ClickHouse Cloud pricing, whether paying through the cloud provider marketplace or directly to ClickHouse? {#is-there-a-difference-in-clickhouse-cloud-pricing-whether-paying-through-the-cloud-provider-marketplace-or-directly-to-clickhouse} -There is no difference in pricing between marketplace billing and signing up directly with ClickHouse. -In either case, your usage of ClickHouse Cloud is tracked in terms of ClickHouse Cloud Credits (CHCs), +There is no difference in pricing between marketplace billing and signing up directly with ClickHouse. +In either case, your usage of ClickHouse Cloud is tracked in terms of ClickHouse Cloud Credits (CHCs), which are metered in the same way and billed accordingly. ### How is compute-compute separation billed? {#how-is-compute-compute-separation-billed} -When creating a service in addition to an existing service, -you can choose if this new service should share the same data with the existing one. -If yes, these two services now form a [warehouse](../reference/warehouses.md). +When creating a service in addition to an existing service, +you can choose if this new service should share the same data with the existing one. +If yes, these two services now form a [warehouse](../reference/warehouses.md). A warehouse has the data stored in it with multiple compute services accessing this data. 
-As the data is stored only once, you only pay for one copy of data, though multiple services are accessing it. +As the data is stored only once, you only pay for one copy of data, though multiple services are accessing it. You pay for compute as usual — there are no additional fees for compute-compute separation / warehouses. By leveraging shared storage in this deployment, users benefit from cost savings on both storage and backups. -Compute-compute separation can save you a significant amount of ClickHouse Credits in some cases. +Compute-compute separation can save you a significant amount of ClickHouse Credits in some cases. A good example is the following setup: 1. You have ETL jobs that are running 24/7 and ingesting data into the service. These ETL jobs do not require a lot of memory so they can run on a small instance with, for example, 32 GiB of RAM. 2. A data scientist on the same team that has ad hoc reporting requirements, says they need to run a query that requires a significant amount of memory - 236 GiB, however does not need high availability and can wait and rerun queries if the first run fails. -In this example you, as an administrator for the database, can do the following: + In this example you, as an administrator for the database, can do the following: 1. Create a small service with two replicas 16 GiB each - this will satisfy the ETL jobs and provide high availability. 2. For the data scientist, you can create a second service in the same warehouse with only one replica with 236 GiB. You can enable idling for this service so you will not be paying for this service when the data scientist is not using it. -Cost estimation (per month) for this example on the **Scale Tier**: + Cost estimation (per month) for this example on the **Scale Tier**: - Parent service active 24 hours day: 2 replicas x 16 GiB 4 vCPU per replica - Child service: 1 replica x 236 GiB 59 vCPU per replica per replica - 3 TB of compressed data + 1 backup - 100 GB of public internet egress data transfer - 50 GB of cross-region data transfer - - +
|                                      | Child service<br/>active 1 hour/day | Child service<br/>active 2 hours/day | Child service<br/>active 4 hours/day |
|--------------------------------------|-------------------------------------|--------------------------------------|--------------------------------------|
| Compute                              | \$1,142.43 | \$1,410.97 | \$1,948.05 |
| Storage                              | \$151.80 | \$151.80 | \$151.80 |
| Public internet egress data transfer | \$11.52 | \$11.52 | \$11.52 |
| Cross-region data transfer           | \$1.56 | \$1.56 | \$1.56 |
| Total                                | \$1,307.31 | \$1,575.85 | \$2,112.93 |
+ + + -Without warehouses, you would have to pay for the amount of memory that the data engineer needs for his queries. -However, combining two services in a warehouse and idling one of them helps you save money. + Without warehouses, you would have to pay for the amount of memory that the data engineer needs for his queries. + However, combining two services in a warehouse and idling one of them helps you save money. ## ClickPipes pricing {#clickpipes-pricing} @@ -382,30 +382,30 @@ to make any changes. There are two main dimensions to pricing: 1. **Ingested Data**: The raw, uncompressed bytes coming from Postgres and - ingested into ClickHouse. + ingested into ClickHouse. 2. **Compute**: The compute units provisioned per service manage multiple - Postgres CDC ClickPipes and are separate from the compute units used by the - ClickHouse Cloud service. This additional compute is dedicated specifically - to Postgres CDC ClickPipes. Compute is billed at the service level, not per - individual pipe. Each compute unit includes 2 vCPUs and 8 GB of RAM. + Postgres CDC ClickPipes and are separate from the compute units used by the + ClickHouse Cloud service. This additional compute is dedicated specifically + to Postgres CDC ClickPipes. Compute is billed at the service level, not per + individual pipe. Each compute unit includes 2 vCPUs and 8 GB of RAM. #### Ingested data {#ingested-data} The Postgres CDC connector operates in two main phases: - **Initial load / resync**: This captures a full snapshot of Postgres tables - and occurs when a pipe is first created or re-synced. + and occurs when a pipe is first created or re-synced. - **Continuous Replication (CDC)**: Ongoing replication of changes—such as inserts, - updates, deletes, and schema changes—from Postgres to ClickHouse. + updates, deletes, and schema changes—from Postgres to ClickHouse. -In most use cases, continuous replication accounts for over 90% of a ClickPipe -life cycle. Because initial loads involve transferring a large volume of data all -at once, we offer a lower rate for that phase. + In most use cases, continuous replication accounts for over 90% of a ClickPipe + life cycle. Because initial loads involve transferring a large volume of data all + at once, we offer a lower rate for that phase. 
-| Phase | Cost | -|----------------------------------|--------------| -| **Initial load / resync** | $0.10 per GB | -| **Continuous Replication (CDC)** | $0.20 per GB | + | Phase | Cost | + |----------------------------------|--------------| + | **Initial load / resync** | $0.10 per GB | + | **Continuous Replication (CDC)** | $0.20 per GB | #### Compute {#compute} @@ -430,13 +430,13 @@ Let's say your service is in Scale tier and has the following setup: ##### Monthly cost breakdown {#cost-breakdown} -**Ingested Data (CDC)**: +**Ingested Data (CDC)**: -$$ 2 \text{ pipes} \times 500 \text{ GB} = 1,000 \text{ GB per month} $$ +$$ 2 \text{ pipes} \times 500 \text{ GB} = 1,000 \text{ GB per month} $$ $$ 1,000 \text{ GB} \times \$0.20/\text{GB} = \$200 $$ -**Compute**: +**Compute**: $$1 \text{ compute unit} \times \$0.20/\text{hr} \times 730 \text{ hours (approximate month)} = \$146$$ @@ -444,10 +444,10 @@ $$1 \text{ compute unit} \times \$0.20/\text{hr} \times 730 \text{ hours (approx Compute is shared across both pipes ::: -**Total Monthly Cost**: +**Total Monthly Cost**: $$\$200 \text{ (ingest)} + \$146 \text{ (compute)} = \$346$$ - + ### ClickPipes for streaming and object storage {#clickpipes-for-streaming-object-storage} This section outlines the pricing model of ClickPipes for streaming and object storage. @@ -457,12 +457,12 @@ This section outlines the pricing model of ClickPipes for streaming and object s It consists of two dimensions - **Compute**: Price per unit per hour - Compute represents the cost of running the ClickPipes replica pods whether they actively ingest data or not. - It applies to all ClickPipes types. + Compute represents the cost of running the ClickPipes replica pods whether they actively ingest data or not. + It applies to all ClickPipes types. - **Ingested data**: per GB pricing - The ingested data rate applies to all streaming ClickPipes - (Kafka, Confluent, Amazon MSK, Amazon Kinesis, Redpanda, WarpStream, Azure Event Hubs) - for the data transferred via the replica pods. The ingested data size (GB) is charged based on bytes received from the source (uncompressed or compressed). + The ingested data rate applies to all streaming ClickPipes + (Kafka, Confluent, Amazon MSK, Amazon Kinesis, Redpanda, WarpStream, Azure Event Hubs) + for the data transferred via the replica pods. The ingested data size (GB) is charged based on bytes received from the source (uncompressed or compressed). #### What are ClickPipes replicas? {#what-are-clickpipes-replicas} @@ -513,93 +513,69 @@ effective data transfer is assumed by the underlying Clickhouse Service_ ## ClickPipes pricing FAQ {#clickpipes-pricing-faq} Below, you will find frequently asked questions about CDC ClickPipes and streaming -and object-based storage ClickPipes. +and object-based storage ClickPipes. ### FAQ for Postgres CDC ClickPipes {#faq-postgres-cdc-clickpipe}
- Is the ingested data measured in pricing based on compressed or uncompressed size? - -The ingested data is measured as _uncompressed data_ coming from Postgres—both -during the initial load and CDC (via the replication slot). Postgres does not -compress data during transit by default, and ClickPipe processes the raw, +The ingested data is measured as _uncompressed data_ coming from Postgres—both +during the initial load and CDC (via the replication slot). Postgres does not +compress data during transit by default, and ClickPipe processes the raw, uncompressed bytes. -
- When will Postgres CDC pricing start appearing on my bills? - Postgres CDC ClickPipes pricing begins appearing on monthly bills starting -**September 1st, 2025**, for all customers—both existing and new. Until then, +**September 1st, 2025**, for all customers—both existing and new. Until then, usage is free. Customers have a **3-month window** starting from **May 29** (the GA announcement date) to review and optimize their usage if needed, although we expect most won't need to make any changes. -
- Will I be charged if I pause my pipes? - -No data ingestion charges apply while a pipe is paused, since no data is moved. -However, compute charges still apply—either 0.5 or 1 compute unit—based on your -organization's tier. This is a fixed service-level cost and applies across all +No data ingestion charges apply while a pipe is paused, since no data is moved. +However, compute charges still apply—either 0.5 or 1 compute unit—based on your +organization's tier. This is a fixed service-level cost and applies across all pipes within that service. -
- How can I estimate my pricing? - The Overview page in ClickPipes provides metrics for both initial load/resync and -CDC data volumes. You can estimate your Postgres CDC costs using these metrics +CDC data volumes. You can estimate your Postgres CDC costs using these metrics in conjunction with the ClickPipes pricing. -
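For instance, using the per-GB rates above and the per-hour compute rate from the worked example, with placeholder volumes that you would replace with your own Overview page metrics, a back-of-the-envelope estimate could look like this:

```sql
-- Sketch only: 200 GB and 500 GB are placeholder volumes, not measured values.
SELECT
      200 * 0.10          -- initial load / resync at $0.10 per GB
    + 500 * 0.20          -- continuous replication (CDC) at $0.20 per GB
    + 1 * 0.20 * 730      -- 1 compute unit at $0.20/hr for ~730 hours
    AS estimated_monthly_usd
```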
- Can I scale the compute allocated for Postgres CDC in my service? - -By default, compute scaling is not user-configurable. The provisioned resources -are optimized to handle most customer workloads optimally. If your use case -requires more or less compute, please open a support ticket so we can evaluate +By default, compute scaling is not user-configurable. The provisioned resources +are optimized to handle most customer workloads optimally. If your use case +requires more or less compute, please open a support ticket so we can evaluate your request. -
- What is the pricing granularity? - - **Compute**: Billed per hour. Partial hours are rounded up to the next hour. - **Ingested Data**: Measured and billed per gigabyte (GB) of uncompressed data. -
- Can I use my ClickHouse Cloud credits for Postgres CDC via ClickPipes? - -Yes. ClickPipes pricing is part of the unified ClickHouse Cloud pricing. Any +Yes. ClickPipes pricing is part of the unified ClickHouse Cloud pricing. Any platform credits you have will automatically apply to ClickPipes usage as well. -
- How much additional cost should I expect from Postgres CDC ClickPipes in my existing monthly ClickHouse Cloud spend? - -The cost varies based on your use case, data volume, and organization tier. -That said, most existing customers see an increase of **0–15%** relative to their -existing monthly ClickHouse Cloud spend post trial. Actual costs may vary -depending on your workload—some workloads involve high data volumes with +The cost varies based on your use case, data volume, and organization tier. +That said, most existing customers see an increase of **0–15%** relative to their +existing monthly ClickHouse Cloud spend post trial. Actual costs may vary +depending on your workload—some workloads involve high data volumes with lesser processing, while others require more processing with less data. -
### FAQ for streaming and object storage ClickPipes {#faq-streaming-and-object-storage} diff --git a/docs/cloud/manage/billing/index.md b/docs/cloud/manage/billing/index.md index 8f51124a3a7..f3a914e1d28 100644 --- a/docs/cloud/manage/billing/index.md +++ b/docs/cloud/manage/billing/index.md @@ -9,7 +9,7 @@ This section of the documentation covers topics related to billing, and contains | Page | Description | |---------------------------------------|----------------------------------------------------------------------| -| [Overview](/cloud/marketplace/marketplace-billing) | Overview and FAQ pages for marketplace billing. | +| [Overview](/cloud/marketplace/marketplace-billing) | Overview and FAQ pages for marketplace billing. | | [Payment Thresholds](/cloud/billing/payment-thresholds) | Learn more about how payment thresholds work and how to adjust them. | | [Troubleshooting Billing Issues](/manage/troubleshooting-billing-issues) | Troubleshoot common billing issues. | | [Marketplace](/cloud/manage/) | Landing page for further marketplace related topics. | diff --git a/docs/cloud/manage/billing/marketplace/aws-marketplace-committed.md b/docs/cloud/manage/billing/marketplace/aws-marketplace-committed.md index 4542c296787..8e534c31017 100644 --- a/docs/cloud/manage/billing/marketplace/aws-marketplace-committed.md +++ b/docs/cloud/manage/billing/marketplace/aws-marketplace-committed.md @@ -25,69 +25,39 @@ Get started with ClickHouse Cloud on the [AWS Marketplace](https://aws.amazon.co 1. You should have received an email with a link to review and accept your private offer. -
+ AWS Marketplace private offer email -AWS Marketplace private offer email - -
- -2. Click on the **Review Offer** link in the email. This should take you to your AWS Marketplace page with the private offer details. While accepting the private offer, choose a value of 1 for the number of units in the Contract Options picklist. +2. Click on the **Review Offer** link in the email. This should take you to your AWS Marketplace page with the private offer details. While accepting the private offer, choose a value of 1 for the number of units in the Contract Options picklist. 3. Complete the steps to subscribe on the AWS portal and click on **Set up your account**. -It is critical to redirect to ClickHouse Cloud at this point and either register for a new account, or sign in with an existing account. Without completing this step, we will not be able to link your AWS Marketplace subscription to ClickHouse Cloud. + It is critical to redirect to ClickHouse Cloud at this point and either register for a new account, or sign in with an existing account. Without completing this step, we will not be able to link your AWS Marketplace subscription to ClickHouse Cloud. 4. Once you redirect to ClickHouse Cloud, you can either login with an existing account, or register with a new account. This step is very important so we can bind your ClickHouse Cloud organization to the AWS Marketplace billing. -
- -ClickHouse Cloud sign in page - -
- -If you are a new ClickHouse Cloud user, click **Register** at the bottom of the page. You will be prompted to create a new user and verify the email. After verifying your email, you can leave the ClickHouse Cloud login page and login using the new username at the [https://console.clickhouse.cloud](https://console.clickhouse.cloud). - -
- -ClickHouse Cloud sign up page - -
+ ClickHouse Cloud sign in page -Note that if you are a new user, you will also need to provide some basic information about your business. See the screenshots below. + If you are a new ClickHouse Cloud user, click **Register** at the bottom of the page. You will be prompted to create a new user and verify the email. After verifying your email, you can leave the ClickHouse Cloud login page and login using the new username at the [https://console.clickhouse.cloud](https://console.clickhouse.cloud). -
+ ClickHouse Cloud sign up page -ClickHouse Cloud sign up info form + Note that if you are a new user, you will also need to provide some basic information about your business. See the screenshots below. -
+ ClickHouse Cloud sign up info form -
+ ClickHouse Cloud sign up info form 2 -ClickHouse Cloud sign up info form 2 - -
- -If you are an existing ClickHouse Cloud user, simply log in using your credentials. + If you are an existing ClickHouse Cloud user, simply log in using your credentials. 5. After successfully logging in, a new ClickHouse Cloud organization will be created. This organization will be connected to your AWS billing account and all usage will be billed via your AWS account. 6. Once you login, you can confirm that your billing is in fact tied to the AWS Marketplace and start setting up your ClickHouse Cloud resources. -
- -ClickHouse Cloud view AWS Marketplace billing - -
+ ClickHouse Cloud view AWS Marketplace billing -ClickHouse Cloud new services page - -
+ ClickHouse Cloud new services page 6. You should receive an email confirming the sign up: -
- -AWS Marketplace confirmation email - -
+ AWS Marketplace confirmation email -If you run into any issues, please do not hesitate to contact [our support team](https://clickhouse.com/support/program). + If you run into any issues, please do not hesitate to contact [our support team](https://clickhouse.com/support/program). diff --git a/docs/cloud/manage/billing/marketplace/aws-marketplace-payg.md b/docs/cloud/manage/billing/marketplace/aws-marketplace-payg.md index f81eb94b108..1ea299ae1ce 100644 --- a/docs/cloud/manage/billing/marketplace/aws-marketplace-payg.md +++ b/docs/cloud/manage/billing/marketplace/aws-marketplace-payg.md @@ -30,107 +30,61 @@ Get started with ClickHouse Cloud on the [AWS Marketplace](https://aws.amazon.co 1. Go to the [AWS Marketplace](https://aws.amazon.com/marketplace) and search for ClickHouse Cloud. -
- -AWS Marketplace home page - -
+ AWS Marketplace home page 2. Click on the [listing](https://aws.amazon.com/marketplace/pp/prodview-jettukeanwrfc) and then on **View purchase options**. -
- -AWS Marketplace search for ClickHouse - -
+ AWS Marketplace search for ClickHouse 3. On the next screen, configure the contract: - **Length of contract** - PAYG contracts run month to month. -- **Renewal settings** - You can set the contract to auto renew or not. -Note that if you don't enable auto renewal, your organization is automatically put into a grace period at the end of the billing cycle and then decommissioned. +- **Renewal settings** - You can set the contract to auto renew or not. + Note that if you don't enable auto renewal, your organization is automatically put into a grace period at the end of the billing cycle and then decommissioned. - **Contract options** - You can input any number (or just 1) into this text box. This will not affect the price you pay as the price for these units for the public offer is $0. These units are usually used when accepting a private offer from ClickHouse Cloud. - **Purchase order** - This is optional and you can ignore this. -
- -AWS Marketplace configure contract - -
+ AWS Marketplace configure contract -After filling out the above information, click on **Create Contract**. You can confirm that the contract price displayed is zero dollars which essentially means that you have no payment due and will incur charges based on usage. + After filling out the above information, click on **Create Contract**. You can confirm that the contract price displayed is zero dollars which essentially means that you have no payment due and will incur charges based on usage. -
- -AWS Marketplace confirm contract - -
+ AWS Marketplace confirm contract 4. Once you click **Create Contract**, you will see a modal to confirm and pay ($0 due). 5. Once you click **Pay now**, you will see a confirmation that you are now subscribed to the AWS Marketplace offering for ClickHouse Cloud. -
- -AWS Marketplace payment confirmation - -
+ AWS Marketplace payment confirmation 6. Note that at this point, the setup is not complete yet. You will need to redirect to ClickHouse Cloud by clicking on **Set up your account** and signing up on ClickHouse Cloud. 7. Once you redirect to ClickHouse Cloud, you can either login with an existing account, or register with a new account. This step is very important so we can bind your ClickHouse Cloud organization to the AWS Marketplace billing. -
- -ClickHouse Cloud sign in page - -
+ ClickHouse Cloud sign in page -If you are a new ClickHouse Cloud user, click **Register** at the bottom of the page. You will be prompted to create a new user and verify the email. After verifying your email, you can leave the ClickHouse Cloud login page and login using the new username at the [https://console.clickhouse.cloud](https://console.clickhouse.cloud). + If you are a new ClickHouse Cloud user, click **Register** at the bottom of the page. You will be prompted to create a new user and verify the email. After verifying your email, you can leave the ClickHouse Cloud login page and login using the new username at the [https://console.clickhouse.cloud](https://console.clickhouse.cloud). -
+ ClickHouse Cloud sign up page -ClickHouse Cloud sign up page + Note that if you are a new user, you will also need to provide some basic information about your business. See the screenshots below. -
+ ClickHouse Cloud sign up info form -Note that if you are a new user, you will also need to provide some basic information about your business. See the screenshots below. + ClickHouse Cloud sign up info form 2 -
- -ClickHouse Cloud sign up info form - -
- -
- -ClickHouse Cloud sign up info form 2 - -
- -If you are an existing ClickHouse Cloud user, simply log in using your credentials. + If you are an existing ClickHouse Cloud user, simply log in using your credentials. 8. After successfully logging in, a new ClickHouse Cloud organization will be created. This organization will be connected to your AWS billing account and all usage will be billed via your AWS account. 9. Once you login, you can confirm that your billing is in fact tied to the AWS Marketplace and start setting up your ClickHouse Cloud resources. -
+ ClickHouse Cloud view AWS Marketplace billing -ClickHouse Cloud view AWS Marketplace billing - -
- -ClickHouse Cloud new services page - -
+ ClickHouse Cloud new services page 10. You should receive an email confirming the sign up: -
- -AWS Marketplace confirmation email - -
+ AWS Marketplace confirmation email -If you run into any issues, please do not hesitate to contact [our support team](https://clickhouse.com/support/program). + If you run into any issues, please do not hesitate to contact [our support team](https://clickhouse.com/support/program). diff --git a/docs/cloud/manage/billing/marketplace/azure-marketplace-committed.md b/docs/cloud/manage/billing/marketplace/azure-marketplace-committed.md index 00c6587d2aa..7ee441687e8 100644 --- a/docs/cloud/manage/billing/marketplace/azure-marketplace-committed.md +++ b/docs/cloud/manage/billing/marketplace/azure-marketplace-committed.md @@ -22,7 +22,6 @@ import azure_marketplace_payg_12 from '@site/static/images/cloud/manage/billing/ Get started with ClickHouse Cloud on the [Azure Marketplace](https://azuremarketplace.microsoft.com/en-us/marketplace/apps) via a committed contract. A committed contract, also known as a a Private Offer, allows customers to commit to spending a certain amount on ClickHouse Cloud over a period of time. - ## Prerequisites {#prerequisites} - A Private Offer from ClickHouse based on specific contract terms. @@ -31,109 +30,56 @@ Get started with ClickHouse Cloud on the [Azure Marketplace](https://azuremarket 1. You should have received an email with a link to review and accept your private offer. -
- -Azure Marketplace private offer email - -
+ Azure Marketplace private offer email 2. Click on the **Review Private Offer** link in the email. This should take you to your GCP Marketplace page with the private offer details. -
- -Azure Marketplace private offer details - -
+ Azure Marketplace private offer details 3. Once you accept the offer, you will be taken to a **Private Offer Management** screen. Azure may take some time to prepare the offer for purchase. -
- -Azure Marketplace Private Offer Management page - -
+ Azure Marketplace Private Offer Management page -Azure Marketplace Private Offer Management page loading - -
+ Azure Marketplace Private Offer Management page loading 4. After a few minutes, refresh the page. The offer should be ready for **Purchase**. -
- -Azure Marketplace Private Offer Management page purchase enabled - -
+ Azure Marketplace Private Offer Management page purchase enabled 5. Click on **Purchase** - you will see a flyout open. Complete the following: -
- -- Subscription and resource group +- Subscription and resource group - Provide a name for the SaaS subscription -- Choose the billing plan that you have a private offer for. Only the term that the private offer was created (for example, 1 year) will have an amount against it. Other billing term options will be for $0 amounts. +- Choose the billing plan that you have a private offer for. Only the term that the private offer was created (for example, 1 year) will have an amount against it. Other billing term options will be for $0 amounts. - Choose whether you want recurring billing or not. If recurring billing is not selected, the contract will end at the end of the billing period and the resources will be set to decommissioned. - Click on **Review + subscribe**. -
- -Azure Marketplace subscription form - -
+ Azure Marketplace subscription form 6. On the next screen, review all the details and hit **Subscribe**. -
- -Azure Marketplace subscription confirmation - -
+ Azure Marketplace subscription confirmation 7. On the next screen, you will see **Your SaaS subscription in progress**. -
- -Azure Marketplace subscription submitting page - -
+ Azure Marketplace subscription submitting page 8. Once ready, you can click on **Configure account now**. Note that is a critical step that binds the Azure subscription to a ClickHouse Cloud organization for your account. Without this step, your Marketplace subscription is not complete. -
- -Azure Marketplace configure account now button - -
+ Azure Marketplace configure account now button 9. You will be redirected to the ClickHouse Cloud sign up or sign in page. You can either sign up using a new account or sign in using an existing account. Once you are signed in, a new organization will be created that is ready to be used and billed via the Azure Marketplace. 10. You will need to answer a few questions - address and company details - before you can proceed. -
- -ClickHouse Cloud sign up info form - -
+ ClickHouse Cloud sign up info form -ClickHouse Cloud sign up info form 2 - -
+ ClickHouse Cloud sign up info form 2 11. Once you hit **Complete sign up**, you will be taken to your organization within ClickHouse Cloud where you can view the billing screen to ensure you are being billed via the Azure Marketplace and can create services. -
- -
- -ClickHouse Cloud sign up info form - -
- -
- -ClickHouse Cloud sign up info form - -
+ ClickHouse Cloud sign up info form -If you run into any issues, please do not hesitate to contact [our support team](https://clickhouse.com/support/program). + ClickHouse Cloud sign up info form + If you run into any issues, please do not hesitate to contact [our support team](https://clickhouse.com/support/program). diff --git a/docs/cloud/manage/billing/marketplace/azure-marketplace-payg.md b/docs/cloud/manage/billing/marketplace/azure-marketplace-payg.md index 469e1dfa5f9..72373641352 100644 --- a/docs/cloud/manage/billing/marketplace/azure-marketplace-payg.md +++ b/docs/cloud/manage/billing/marketplace/azure-marketplace-payg.md @@ -30,118 +30,60 @@ Get started with ClickHouse Cloud on the [Azure Marketplace](https://azuremarket 1. Go to [Azure Marketplace](https://azuremarketplace.microsoft.com/en-us/marketplace/apps) and search for ClickHouse Cloud. Make sure you are logged in so you can purchase an offering on the marketplace. -
- -ClickHouse Cloud sign up info form - -
+ ClickHouse Cloud sign up info form 2. On the product listing page, click on **Get It Now**. -
- -ClickHouse Cloud sign up info form - -
+ ClickHouse Cloud sign up info form 3. You will need to provide a name, email, and location information on the next screen. -
- -ClickHouse Cloud sign up info form - -
+ ClickHouse Cloud sign up info form 4. On the next screen, click on **Subscribe**. -
- -ClickHouse Cloud sign up info form - -
+ ClickHouse Cloud sign up info form 5. On the next screen, choose the subscription, resource group, and resource group location. The resource group location does not have to be the same location as where you intend to launch your services on ClickHouse Cloud. -
- -ClickHouse Cloud sign up info form - -
+ ClickHouse Cloud sign up info form 6. You will also need to provide a name for the subscription as well as choose the billing term from the available options. You can choose to set **Recurring billing** to on or off. If you set it "off", your contract will end after the billing term ends and your resources will be decommissioned. -
- -ClickHouse Cloud sign up info form - -
+ ClickHouse Cloud sign up info form 7. Click **"Review + subscribe"**. 8. On the next screen, verify that everything looks correct and click **Subscribe**. -
- -ClickHouse Cloud sign up info form - -
+ ClickHouse Cloud sign up info form 9. Note that at this point, you will have subscribed to the Azure subscription of ClickHouse Cloud, but you have not yet set up your account on ClickHouse Cloud. The next steps are necessary and critical for ClickHouse Cloud to be able to bind to your Azure subscription so your billing happens correctly through the Azure marketplace. -
- -ClickHouse Cloud sign up info form - -
+ ClickHouse Cloud sign up info form 10. Once the Azure set up completes, the **Configure account now** button should become active. -
- -ClickHouse Cloud sign up info form - -
+ ClickHouse Cloud sign up info form 11. Click on **Configure account now**. -
- -You will receive an email like the one below with details on configuring your account: + You will receive an email like the one below with details on configuring your account: -
- -ClickHouse Cloud sign up info form - -
+ ClickHouse Cloud sign up info form 12. You will be redirected to the ClickHouse Cloud sign up or sign in page. You can either sign up using a new account or sign in using an existing account. Once you are signed in, a new organization will be created that is ready to be used and billed via the Azure Marketplace. 13. You will need to answer a few questions - address and company details - before you can proceed. -
- -ClickHouse Cloud sign up info form + ClickHouse Cloud sign up info form -
- -ClickHouse Cloud sign up info form 2 - -
+ ClickHouse Cloud sign up info form 2 14. Once you hit **Complete sign up**, you will be taken to your organization within ClickHouse Cloud where you can view the billing screen to ensure you are being billed via the Azure Marketplace and can create services. -
- -
- -ClickHouse Cloud sign up info form - -
- -
- -ClickHouse Cloud sign up info form + ClickHouse Cloud sign up info form -
+ ClickHouse Cloud sign up info form 15. If you run into any issues, please do not hesitate to contact [our support team](https://clickhouse.com/support/program). diff --git a/docs/cloud/manage/billing/marketplace/gcp-marketplace-committed.md b/docs/cloud/manage/billing/marketplace/gcp-marketplace-committed.md index 26186a93416..1ee1cdf3871 100644 --- a/docs/cloud/manage/billing/marketplace/gcp-marketplace-committed.md +++ b/docs/cloud/manage/billing/marketplace/gcp-marketplace-committed.md @@ -31,111 +31,56 @@ Get started with ClickHouse Cloud on the [GCP Marketplace](https://console.cloud 1. You should have received an email with a link to review and accept your private offer. -
- -GCP Marketplace private offer email - -
+ GCP Marketplace private offer email 2. Click on the **Review Offer** link in the email. This should take you to your GCP Marketplace page with the private offer details. -
- -GCP Marketplace offer summary - -
+ GCP Marketplace offer summary -GCP Marketplace pricing summary - -
+ GCP Marketplace pricing summary 3. Review the private offer details and if everything is correct, click **Accept**. -
- -GCP Marketplace accept page - -
+ GCP Marketplace accept page 4. Click on **Go to product page**. -
- -GCP Marketplace acceptance confirmation - -
+ GCP Marketplace acceptance confirmation 5. Click on **Manage on provider**. -
- -GCP Marketplace ClickHouse Cloud page - -
- -It is critical to redirect to ClickHouse Cloud at this point and sign up or sign in. Without completing this step, we will not be able to link your GCP Marketplace subscription to ClickHouse Cloud. - -
- -GCP Marketplace leaving website confirmation modal + GCP Marketplace ClickHouse Cloud page -
+ It is critical to redirect to ClickHouse Cloud at this point and sign up or sign in. Without completing this step, we will not be able to link your GCP Marketplace subscription to ClickHouse Cloud. -6. Once you redirect to ClickHouse Cloud, you can either login with an existing account, or register with a new account. + GCP Marketplace leaving website confirmation modal -
+6. Once you redirect to ClickHouse Cloud, you can either login with an existing account, or register with a new account. -ClickHouse Cloud sign in page + ClickHouse Cloud sign in page -
+ If you are a new ClickHouse Cloud user, click **Register** at the bottom of the page. You will be prompted to create a new user and verify the email. After verifying your email, you can leave the ClickHouse Cloud login page and login using the new username at the [https://console.clickhouse.cloud](https://console.clickhouse.cloud). -If you are a new ClickHouse Cloud user, click **Register** at the bottom of the page. You will be prompted to create a new user and verify the email. After verifying your email, you can leave the ClickHouse Cloud login page and login using the new username at the [https://console.clickhouse.cloud](https://console.clickhouse.cloud). + ClickHouse Cloud sign up page -
+ Note that if you are a new user, you will also need to provide some basic information about your business. See the screenshots below. -ClickHouse Cloud sign up page + ClickHouse Cloud sign up info form -
+ ClickHouse Cloud sign up info form 2 -Note that if you are a new user, you will also need to provide some basic information about your business. See the screenshots below. - -
- -ClickHouse Cloud sign up info form - -
- -ClickHouse Cloud sign up info form 2 - -
- -If you are an existing ClickHouse Cloud user, simply log in using your credentials. + If you are an existing ClickHouse Cloud user, simply log in using your credentials. 7. After successfully logging in, a new ClickHouse Cloud organization will be created. This organization will be connected to your GCP billing account and all usage will be billed via your GCP account. 8. Once you login, you can confirm that your billing is in fact tied to the GCP Marketplace and start setting up your ClickHouse Cloud resources. -
- -ClickHouse Cloud sign in page + ClickHouse Cloud sign in page -
- -ClickHouse Cloud new services page - -
+ ClickHouse Cloud new services page 9. You should receive an email confirming the sign up: -
-
- -GCP Marketplace confirmation email - -
- -
- -If you run into any issues, please do not hesitate to contact [our support team](https://clickhouse.com/support/program). - + GCP Marketplace confirmation email + If you run into any issues, please do not hesitate to contact [our support team](https://clickhouse.com/support/program). diff --git a/docs/cloud/manage/billing/marketplace/gcp-marketplace-payg.md b/docs/cloud/manage/billing/marketplace/gcp-marketplace-payg.md index 7386667d36f..4efd93a6e44 100644 --- a/docs/cloud/manage/billing/marketplace/gcp-marketplace-payg.md +++ b/docs/cloud/manage/billing/marketplace/gcp-marketplace-payg.md @@ -29,12 +29,11 @@ Get started with ClickHouse Cloud on the [GCP Marketplace](https://console.cloud 1. Go to the [GCP Marketplace](https://cloud.google.com/marketplace) and search for ClickHouse Cloud. Make sure you have the correct project chosen. - -GCP Marketplace home page + GCP Marketplace home page 2. Click on the [listing](https://console.cloud.google.com/marketplace/product/clickhouse-public/clickhouse-cloud) and then on **Subscribe**. -ClickHouse Cloud in GCP Marketplace + ClickHouse Cloud in GCP Marketplace 3. On the next screen, configure the subscription: @@ -43,77 +42,40 @@ Get started with ClickHouse Cloud on the [GCP Marketplace](https://console.cloud - Choose the appropriate billing account - Accept the terms and click **Subscribe** -
- -Configure subscription in GCP Marketplace - -
+ Configure subscription in GCP Marketplace 4. Once you click **Subscribe**, you will see a modal **Sign up with ClickHouse**. -
- -GCP Marketplace sign up modal - -
+ GCP Marketplace sign up modal 5. Note that at this point, the setup is not complete yet. You will need to redirect to ClickHouse Cloud by clicking on **Set up your account** and signing up on ClickHouse Cloud. 6. Once you redirect to ClickHouse Cloud, you can either login with an existing account, or register with a new account. This step is very important so we can bind your ClickHouse Cloud organization to the GCP Marketplace billing. -
- -ClickHouse Cloud sign in page - -
- -If you are a new ClickHouse Cloud user, click **Register** at the bottom of the page. You will be prompted to create a new user and verify the email. After verifying your email, you can leave the ClickHouse Cloud login page and login using the new username at the [https://console.clickhouse.cloud](https://console.clickhouse.cloud). - -
- -ClickHouse Cloud sign up page + ClickHouse Cloud sign in page -
+ If you are a new ClickHouse Cloud user, click **Register** at the bottom of the page. You will be prompted to create a new user and verify the email. After verifying your email, you can leave the ClickHouse Cloud login page and login using the new username at the [https://console.clickhouse.cloud](https://console.clickhouse.cloud). -Note that if you are a new user, you will also need to provide some basic information about your business. See the screenshots below. + ClickHouse Cloud sign up page -
+ Note that if you are a new user, you will also need to provide some basic information about your business. See the screenshots below. -ClickHouse Cloud sign up info form + ClickHouse Cloud sign up info form -
+ ClickHouse Cloud sign up info form 2 -ClickHouse Cloud sign up info form 2 - -
- -If you are an existing ClickHouse Cloud user, simply log in using your credentials. + If you are an existing ClickHouse Cloud user, simply log in using your credentials. 7. After successfully logging in, a new ClickHouse Cloud organization will be created. This organization will be connected to your GCP billing account and all usage will be billed via your GCP account. 8. Once you login, you can confirm that your billing is in fact tied to the GCP Marketplace and start setting up your ClickHouse Cloud resources. -
- -ClickHouse Cloud sign in page - -
- -ClickHouse Cloud new services page + ClickHouse Cloud sign in page -
+ ClickHouse Cloud new services page 9. You should receive an email confirming the sign up: -
-
- -GCP Marketplace confirmation email - -
- -
- -If you run into any issues, please do not hesitate to contact [our support team](https://clickhouse.com/support/program). - + GCP Marketplace confirmation email + If you run into any issues, please do not hesitate to contact [our support team](https://clickhouse.com/support/program). diff --git a/docs/cloud/manage/billing/marketplace/overview.md b/docs/cloud/manage/billing/marketplace/overview.md index 1283206ecc3..97b79188999 100644 --- a/docs/cloud/manage/billing/marketplace/overview.md +++ b/docs/cloud/manage/billing/marketplace/overview.md @@ -31,13 +31,13 @@ Signing up for ClickHouse Cloud from the cloud provider marketplace is a two ste 1. You first "subscribe" to ClickHouse Cloud on the cloud providers' marketplace portal. After you have finished subscribing, you click on "Pay Now" or "Manage on Provider" (depending on the marketplace). This redirects you to ClickHouse Cloud. 2. On Clickhouse Cloud you either register for a new account, or sign in with an existing account. Either way, a new ClickHouse Cloud organization will be created for you which is tied to your marketplace billing. -NOTE: Your existing services and organizations from any prior ClickHouse Cloud signups will remain and they will not be connected to the marketplace billing. ClickHouse Cloud allows you to use the same account to manage multiple organization, each with different billing. + NOTE: Your existing services and organizations from any prior ClickHouse Cloud signups will remain and they will not be connected to the marketplace billing. ClickHouse Cloud allows you to use the same account to manage multiple organization, each with different billing. -You can switch between organizations from the bottom left menu of the ClickHouse Cloud console. + You can switch between organizations from the bottom left menu of the ClickHouse Cloud console. ### I am an existing ClickHouse Cloud user. What should I do if I want my existing services to be billed via marketplace?​ {#i-am-an-existing-clickhouse-cloud-user-what-should-i-do-if-i-want-my-existing-services-to-be-billed-via-marketplace} -You will need to subscribe to ClickHouse Cloud via the cloud provider marketplace. Once you finish subscribing on the marketplace, and redirect to ClickHouse Cloud you will have the option of linking an existing ClickHouse Cloud organization to marketplace billing. From that point on, your existing resources will now get billed via the marketplace. +You will need to subscribe to ClickHouse Cloud via the cloud provider marketplace. Once you finish subscribing on the marketplace, and redirect to ClickHouse Cloud you will have the option of linking an existing ClickHouse Cloud organization to marketplace billing. From that point on, your existing resources will now get billed via the marketplace. Marketplace signup and org linking @@ -60,7 +60,7 @@ In that case please subscribe to the ClickHouse Cloud as usual (see sections on - For AWS marketplace a new ClickHouse Cloud organization will be created and connected to the marketplace. - For the GCP marketplace your old organization will be reactivated. -If you have any trouble with reactivating your marketplace org, please contact [ClickHouse Cloud Support](https://clickhouse.com/support/program). + If you have any trouble with reactivating your marketplace org, please contact [ClickHouse Cloud Support](https://clickhouse.com/support/program). 
### How do I access my invoice for my marketplace subscription to the ClickHouse Cloud service?​ {#how-do-i-access-my-invoice-for-my-marketplace-subscription-to-the-clickhouse-cloud-service} @@ -89,4 +89,4 @@ A single ClickHouse organization can only be configured to bill to a single Clou ### If my ClickHouse Organization is billed through a cloud marketplace committed spend agreement will I automatically move to PAYG billing when I run out of credits? {#automatically-move-to-PAYG-when-running-out-of-credit} -If your marketplace committed spend contract is active and you run out of credits we will automatically move your organization to PAYG billing. However, when your existing contract expires, you will need to link a new marketplace contract to your organization or move your organization to direct billing via credit card. +If your marketplace committed spend contract is active and you run out of credits we will automatically move your organization to PAYG billing. However, when your existing contract expires, you will need to link a new marketplace contract to your organization or move your organization to direct billing via credit card. diff --git a/docs/cloud/manage/billing/payment-thresholds.md b/docs/cloud/manage/billing/payment-thresholds.md index 0c2b6948d0e..2b92b6b23c0 100644 --- a/docs/cloud/manage/billing/payment-thresholds.md +++ b/docs/cloud/manage/billing/payment-thresholds.md @@ -8,7 +8,7 @@ keywords: ['billing', 'payment thresholds', 'automatic invoicing', 'invoice'] # Payment thresholds -When your amount due in a billing period for ClickHouse Cloud reaches $10,000 USD or the equivalent value, your payment method will be automatically charged. A failed charge will result in the suspension or termination of your services after a grace period. +When your amount due in a billing period for ClickHouse Cloud reaches $10,000 USD or the equivalent value, your payment method will be automatically charged. A failed charge will result in the suspension or termination of your services after a grace period. :::note This payment threshold does not apply to customers who have a committed spend contract or other negotiated contractual agreement with ClickHouse. diff --git a/docs/cloud/manage/cloud-tiers.md b/docs/cloud/manage/cloud-tiers.md index 86c6227f739..26344a1fec8 100644 --- a/docs/cloud/manage/cloud-tiers.md +++ b/docs/cloud/manage/cloud-tiers.md @@ -7,7 +7,7 @@ description: 'Cloud tiers available in ClickHouse Cloud' # ClickHouse Cloud tiers -There are several tiers available in ClickHouse Cloud. +There are several tiers available in ClickHouse Cloud. Tiers are assigned at any organizational level. Services within an organization therefore belong to the same tier. This page discusses which tiers are right for your specific use case. @@ -162,19 +162,19 @@ This page discusses which tiers are right for your specific use case. - Cost-effective option that supports single-replica deployments. - Ideal for departmental use cases with smaller data volumes that do not have hard reliability guarantees. -:::note -Services in the basic tier are meant to be fixed in size and do not allow scaling, both automatic and manual. -Users can upgrade to the Scale or Enterprise tier to scale their services. -::: + :::note + Services in the basic tier are meant to be fixed in size and do not allow scaling, both automatic and manual. + Users can upgrade to the Scale or Enterprise tier to scale their services. 
+ ::: ## Scale {#scale} Designed for workloads requiring enhanced SLAs (2+ replica deployments), scalability, and advanced security. -- Offers support for features such as: - - [Private networking support](../security/private-link-overview.md). - - [Compute-compute separation](../reference/warehouses#what-is-compute-compute-separation). - - [Flexible scaling](../manage/scaling.md) options (scale up/down, in/out). +- Offers support for features such as: + - [Private networking support](../security/private-link-overview.md). + - [Compute-compute separation](../reference/warehouses#what-is-compute-compute-separation). + - [Flexible scaling](../manage/scaling.md) options (scale up/down, in/out). ## Enterprise {#enterprise} @@ -184,21 +184,20 @@ Caters to large-scale, mission critical deployments that have stringent security - Flexible scaling: standard profiles (`1:4 vCPU:memory ratio`), as well as `HighMemory (1:8 ratio)` and `HighCPU (1:2 ratio)` custom profiles. - Provides the highest levels of performance and reliability guarantees. - Supports enterprise-grade security: - - Single Sign On (SSO) - - Enhanced Encryption: For AWS and GCP services. Services are encrypted by our key by default and can be rotated to their key to enable Customer Managed Encryption Keys (CMEK). -- Allows Scheduled upgrades: Users can select the day of the week/time window for upgrades, both database and cloud releases. + - Single Sign On (SSO) + - Enhanced Encryption: For AWS and GCP services. Services are encrypted by our key by default and can be rotated to their key to enable Customer Managed Encryption Keys (CMEK). +- Allows Scheduled upgrades: Users can select the day of the week/time window for upgrades, both database and cloud releases. - Offers [HIPAA](../security/compliance-overview.md/#hipaa-since-2024) Compliance. - Exports Backups to the user's account. -:::note -Single replica services across all three tiers are meant to be fixed in size (`8 GiB`, `12 GiB`) -::: + :::note + Single replica services across all three tiers are meant to be fixed in size (`8 GiB`, `12 GiB`) + ::: ## Upgrading to a different tier {#upgrading-to-a-different-tier} You can always upgrade from Basic to Scale or from Scale to Enterprise. Downgrading tiers will require disabling premium features. - --- If you have any questions about service types, please see the [pricing page](https://clickhouse.com/pricing) or contact support@clickhouse.com. diff --git a/docs/cloud/manage/dashboards.md b/docs/cloud/manage/dashboards.md index c33953aa82b..a06eb877619 100644 --- a/docs/cloud/manage/dashboards.md +++ b/docs/cloud/manage/dashboards.md @@ -26,23 +26,23 @@ The SQL Console's dashboards feature allows you to collect and share visualizati ### Query sharing {#query-sharing} -In order to share your dashboard with colleagues, please be sure to share the underlying saved query. To view a visualization, users must have, at a minimum, read-only access to the underlying saved query. +In order to share your dashboard with colleagues, please be sure to share the underlying saved query. To view a visualization, users must have, at a minimum, read-only access to the underlying saved query. ### Interactivity {#interactivity} -Use [query parameters](/sql-reference/syntax#defining-and-using-query-parameters) to make your dashboard interactive. For instance, you can add a query parameter to a `WHERE` clause to function as a filter. +Use [query parameters](/sql-reference/syntax#defining-and-using-query-parameters) to make your dashboard interactive. 
For instance, you can add a query parameter to a `WHERE` clause to function as a filter. -You can toggle the query parameter input via the **Global** filters side pane by selecting a “filter” type in the visualization settings. You can also toggle the query parameter input by linking to another object (like a table) on the dashboard. Please see the “[configure a filter](/cloud/manage/dashboards#configure-a-filter)” section of the quick start guide below. +You can toggle the query parameter input via the **Global** filters side pane by selecting a “filter” type in the visualization settings. You can also toggle the query parameter input by linking to another object (like a table) on the dashboard. Please see the “[configure a filter](/cloud/manage/dashboards#configure-a-filter)” section of the quick start guide below. ## Quick start {#quick-start} -Let's create a dashboard to monitor our ClickHouse service using the [query\_log](/operations/system-tables/query_log) system table. +Let's create a dashboard to monitor our ClickHouse service using the [query\_log](/operations/system-tables/query_log) system table. ## Quick start {#quick-start-1} ### Create a saved query {#create-a-saved-query} -If you already have saved queries to visualize, you can skip this step. +If you already have saved queries to visualize, you can skip this step. Open a new query tab. Let's write a query to count query volume by day on a service using ClickHouse system tables: @@ -54,11 +54,11 @@ We can view the results of the query in table format or start building visualiza More documentation around saved queries can be found in the [Saving a Query section](/cloud/get-started/sql-console#saving-a-query). -We can create and save another query, `query count by query kind`, to count the number of queries by query kind. Here's a bar chart visualization of the data in the SQL console. +We can create and save another query, `query count by query kind`, to count the number of queries by query kind. Here's a bar chart visualization of the data in the SQL console. A bar chart visualization of a query's results -Now that there's two queries, let's create a dashboard to visualize and collect these queries. +Now that there's two queries, let's create a dashboard to visualize and collect these queries. ### Create a dashboard {#create-a-dashboard} @@ -72,9 +72,9 @@ There's two saved queries, `queries over time` and `query count by query kind`. Add a visualization -Here, additional stylistic changes can also be made - like number formatting, legend layout, and axis labels. +Here, additional stylistic changes can also be made - like number formatting, legend layout, and axis labels. -Next, let's visualize the second query as a table, and position it below the line chart. +Next, let's visualize the second query as a table, and position it below the line chart. Visualize query results as a table @@ -82,22 +82,22 @@ You've created your first dashboard by visualizing two saved queries! ### Configure a filter {#configure-a-filter} -Let's make this dashboard interactive by adding a filter on query kind so you can display just the trends related to Insert queries. We'll accomplish this task using [query parameters](/sql-reference/syntax#defining-and-using-query-parameters). +Let's make this dashboard interactive by adding a filter on query kind so you can display just the trends related to Insert queries. We'll accomplish this task using [query parameters](/sql-reference/syntax#defining-and-using-query-parameters). 
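For orientation, here is a rough sketch of what such a parameterized saved query can look like. The host, password, and exact filter values below are illustrative placeholders rather than values from this guide; the same `{query_kind:String}` parameter syntax applies whether the query is run from the SQL console or, as shown here, from `clickhouse-client`:

```bash
# Illustrative sketch only: count finished queries per day, filtered by a
# query_kind parameter. The host and CH_PASSWORD below are placeholders.
clickhouse-client --host your-service.clickhouse.cloud --secure --password "$CH_PASSWORD" \
  --param_query_kind='Insert' \
  --query "
    SELECT
        toStartOfDay(event_time) AS day,
        count() AS query_count
    FROM system.query_log
    WHERE type = 'QueryFinish'
      AND query_kind = {query_kind:String}
    GROUP BY day
    ORDER BY day
  "
```

Saving the query in this parameterized form is what lets the dashboard later expose the parameter as a filter.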
-Click on the three dots next to the line chart, and click on the pencil button next to the query to open the in-line query editor. Here, we can edit the underlying saved query directly from the dashboard. +Click on the three dots next to the line chart, and click on the pencil button next to the query to open the in-line query editor. Here, we can edit the underlying saved query directly from the dashboard. Edit the underlying query -Now, when the yellow run query button is pressed, you'll see the same query from earlier filtered on just insert queries. Click on the save button to update the query. When you return to the chart settings, you'll be able to filter the line chart. +Now, when the yellow run query button is pressed, you'll see the same query from earlier filtered on just insert queries. Click on the save button to update the query. When you return to the chart settings, you'll be able to filter the line chart. -Now, using Global Filters on the top ribbon, you can toggle the filter by changing the input. +Now, using Global Filters on the top ribbon, you can toggle the filter by changing the input. Adjust global filters -Suppose you want to link the line chart's filter to the table. You can do this by going back to the visualization settings, and changing the query_kind query parameter' value source to a table, and selecting the query_kind column as the field to link. +Suppose you want to link the line chart's filter to the table. You can do this by going back to the visualization settings, and changing the query_kind query parameter' value source to a table, and selecting the query_kind column as the field to link. Changing query parameter -Now, you can control the filter on the line chart directly from the queries by kind table to make your dashboard interactive. +Now, you can control the filter on the line chart directly from the queries by kind table to make your dashboard interactive. Control the filter on the line chart diff --git a/docs/cloud/manage/index.md b/docs/cloud/manage/index.md index 46c407d0c6b..7e9202d58be 100644 --- a/docs/cloud/manage/index.md +++ b/docs/cloud/manage/index.md @@ -6,7 +6,7 @@ hide_title: true description: 'Overview page for Managing Cloud' --- -# Managing Cloud +# Managing Cloud In this section of the docs you will find all the information you may need about managing ClickHouse cloud. This section contains the following pages: diff --git a/docs/cloud/manage/integrations.md b/docs/cloud/manage/integrations.md index 67e562aa23d..7f4472b33e7 100644 --- a/docs/cloud/manage/integrations.md +++ b/docs/cloud/manage/integrations.md @@ -15,7 +15,6 @@ Besides the dozens of integrations available for ClickHouse, there are also some [ClickPipes](/integrations/clickpipes) is a managed integration platform to ingest data into ClickHouse Cloud using a simple, web-based UI. It currently supports Apache Kafka, S3, GCS and Amazon Kinesis, with more integrations coming soon. - ### Looker Studio for ClickHouse Cloud {#looker-studio-for-clickhouse-cloud} [Looker Studio](https://lookerstudio.google.com/) is a popular business intelligence tool provided by Google. Looker Studio does not currently provide a ClickHouse connector but instead relies on the MySQL wire protocol to connect to ClickHouse. 
diff --git a/docs/cloud/manage/jan2025_faq/_snippets/_clickpipes_faq.md b/docs/cloud/manage/jan2025_faq/_snippets/_clickpipes_faq.md index 215982c8b11..e1ef7726052 100644 --- a/docs/cloud/manage/jan2025_faq/_snippets/_clickpipes_faq.md +++ b/docs/cloud/manage/jan2025_faq/_snippets/_clickpipes_faq.md @@ -4,142 +4,100 @@ import clickpipesPricingFaq2 from '@site/static/images/cloud/manage/jan2025_faq/ import clickpipesPricingFaq3 from '@site/static/images/cloud/manage/jan2025_faq/external_clickpipes_pricing_faq_3.png';
- Why are we introducing a pricing model for ClickPipes now? - -We decided to initially launch ClickPipes for free with the idea to gather +We decided to initially launch ClickPipes for free with the idea to gather feedback, refine features, and ensure it meets user needs. -As the GA platform has grown, it has effectively stood the test of time by -moving trillions of rows. Introducing a pricing model allows us to continue +As the GA platform has grown, it has effectively stood the test of time by +moving trillions of rows. Introducing a pricing model allows us to continue improving the service, maintaining the infrastructure, and providing dedicated support and new connectors. -
- What are ClickPipes replicas? - ClickPipes ingests data from remote data sources via a dedicated infrastructure that runs and scales independently of the ClickHouse Cloud service. For this reason, it uses dedicated compute replicas. The diagrams below show a simplified architecture. - For streaming ClickPipes, ClickPipes replicas access the remote data sources (e.g., a Kafka broker), pull the data, process and ingest it into the destination ClickHouse service. - ClickPipes Replicas - Streaming ClickPipes - In the case of object storage ClickPipes, the ClickPipes replica orchestrates the data loading task (identifying files to copy, maintaining the state, and moving partitions), while the data is pulled directly from the ClickHouse service. - ClickPipes Replicas - Object Storage ClickPipes -
- What's the default number of replicas and their size? - Each ClickPipe defaults to 1 replica that's provided with 2 GiB of RAM and 0.5 vCPU. This corresponds to **0.25** ClickHouse compute units (1 unit = 8 GiB RAM, 2 vCPUs). -
- Can ClickPipes replicas be scaled? - ClickPipes for streaming can be scaled horizontally by adding more replicas each with a base unit of **0.25** ClickHouse compute units. Vertical scaling is also available on demand for specific use cases (adding more CPU and RAM per replica). -
- How many ClickPipes replicas do I need? - It depends on the workload throughput and latency requirements. We recommend starting with the default value of 1 replica, measuring your latency, and adding replicas if needed. Keep in mind that for Kafka ClickPipes, you also have to scale the Kafka broker partitions accordingly. The scaling controls are available under "settings" for each streaming ClickPipe. - ClickPipes Replicas - How many ClickPipes replicas do I need? -
- What does the ClickPipes pricing structure look like? - It consists of two dimensions: - **Compute**: Price per unit per hour - Compute represents the cost of running the ClickPipes replica pods whether they actively ingest data or not. - It applies to all ClickPipes types. +Compute represents the cost of running the ClickPipes replica pods whether they actively ingest data or not. +It applies to all ClickPipes types. - **Ingested data**: per GB pricing - The ingested data rate applies to all streaming ClickPipes - (Kafka, Confluent, Amazon MSK, Amazon Kinesis, Redpanda, WarpStream, - Azure Event Hubs) for the data transferred via the replica pods. - The ingested data size (GB) is charged based on bytes received from the source (uncompressed or compressed). - +The ingested data rate applies to all streaming ClickPipes +(Kafka, Confluent, Amazon MSK, Amazon Kinesis, Redpanda, WarpStream, +Azure Event Hubs) for the data transferred via the replica pods. +The ingested data size (GB) is charged based on bytes received from the source (uncompressed or compressed).
- What are the ClickPipes public prices? - - Compute: \$0.20 per unit per hour ($0.05 per replica per hour) - Ingested data: $0.04 per GB -
- How does it look in an illustrative example? - For example, ingesting 1 TB of data over 24 hours using the Kafka connector with a single replica (0.25 compute unit) costs: - $$ (0.25 \times 0.20 \times 24) + (0.04 \times 1000) = \$41.2 $$ -
- For object storage connectors (S3 and GCS), only the ClickPipes compute cost is incurred since the ClickPipes pod is not processing data but only orchestrating the transfer which is operated by the underlying ClickHouse service: - $$ 0.25 \times 0.20 \times 24 = \$1.2 $$ -
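As a quick sanity check of the two figures above, the arithmetic can be reproduced with a few lines of shell. The unit prices are simply the public prices listed earlier; nothing here introduces an additional pricing rule:

```bash
# Back-of-the-envelope check of the two examples above (prices in USD).
unit_price=0.20      # per ClickHouse compute unit per hour
ingest_price=0.04    # per GB ingested (streaming ClickPipes only)
units=0.25           # one default replica = 0.25 compute units
hours=24
ingested_gb=1000     # 1 TB

awk -v u="$unit_price" -v i="$ingest_price" -v c="$units" -v h="$hours" -v g="$ingested_gb" 'BEGIN {
  printf "streaming (Kafka): $%.2f\n", c * u * h + i * g   # compute + ingested data -> $41.20
  printf "object storage:    $%.2f\n", c * u * h           # compute only            -> $1.20
}'
```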
- When does the new pricing model take effect? - The new pricing model takes effect for all organizations created after January 27th, 2025. -
- What happens to current users? - Existing users will have a **60-day grace period** during which the ClickPipes service continues to be offered for free. Billing will automatically start for ClickPipes for existing users on **March 24th, 2025.** -
- How does ClickPipes pricing compare to the market? - The philosophy behind ClickPipes pricing is to cover the operating costs of the platform while offering an easy and reliable way to move data to ClickHouse Cloud. From that angle, our market analysis revealed that we are positioned competitively. - -
\ No newline at end of file + diff --git a/docs/cloud/manage/jan2025_faq/billing.md b/docs/cloud/manage/jan2025_faq/billing.md index a8be3fb2a32..a1ac69dcad6 100644 --- a/docs/cloud/manage/jan2025_faq/billing.md +++ b/docs/cloud/manage/jan2025_faq/billing.md @@ -15,12 +15,12 @@ Some notable changes: - Storage price per TB will be reduced, and storage cost will no longer include backups (we will charge for them separately and will make only one backup required). Storage costs are the same across tiers and vary by region and cloud service provider. - Compute costs will vary by tier, region, and cloud service provider. -- The new pricing dimension for data transfer is applicable for data egress across regions and on the public internet only. -- New pricing dimension for ClickPipes usage. +- The new pricing dimension for data transfer is applicable for data egress across regions and on the public internet only. +- New pricing dimension for ClickPipes usage. ### What happens to users with existing committed spend contracts? {#what-happens-to-users-with-existing-committed-spend-contracts} -Users with active committed spend contracts will not be affected by the new per-dimension unit cost prices for compute and storage until their contract expires. However, the new pricing dimensions for data transfer and ClickPipes will be applicable starting March 24, 2025. Most customers will not see a significant increase in their monthly bill from these new dimensions. +Users with active committed spend contracts will not be affected by the new per-dimension unit cost prices for compute and storage until their contract expires. However, the new pricing dimensions for data transfer and ClickPipes will be applicable starting March 24, 2025. Most customers will not see a significant increase in their monthly bill from these new dimensions. ### Can users on a committed spend agreement with ClickHouse continue to launch services on the old plan? {#can-users-on-a-committed-spend-agreement-with-clickhouse-continue-to-launch-services-on-the-old-plan} @@ -38,12 +38,10 @@ Users on a monthly PAYG plan will continue to be billed using the old pricing pl ### Where can I reference legacy plans? {#where-can-i-reference-legacy-plans} -Legacy plans are available for reference [here](https://clickhouse.com/pricing?legacy=true). +Legacy plans are available for reference [here](https://clickhouse.com/pricing?legacy=true). ## Marketplaces {#marketplaces} ### Are there changes to how users are charged via the CSP marketplaces? {#are-there-changes-to-how-users-are-charged-via-the-csp-marketplaces} Users who sign up to ClickHouse Cloud via a CSP Marketplace incur usage in terms of CHCs (ClickHouse Cloud Credits). This behavior has not changed. However, the underlying composition of the credit usage will align with the pricing and packaging changes outlined here and include charges for any data transfer usage and ClickPipes once those are live. - - diff --git a/docs/cloud/manage/jan2025_faq/dimensions.md b/docs/cloud/manage/jan2025_faq/dimensions.md index c4dd9268593..4afb5b4d8f2 100644 --- a/docs/cloud/manage/jan2025_faq/dimensions.md +++ b/docs/cloud/manage/jan2025_faq/dimensions.md @@ -27,7 +27,7 @@ Data transfer and ClickPipes pricing doesn't apply to legacy plans, i.e. Develop - Data transfer pricing does **not** vary between organizational tiers. - Public egress pricing is based only on the origin region. Inter-region (or cross-region) pricing depends on both the origin and destination regions. 
- + ### Will data transfer pricing be tiered as usage increases? {#will-data-transfer-pricing-be-tiered-as-usage-increases} diff --git a/docs/cloud/manage/jan2025_faq/index.md b/docs/cloud/manage/jan2025_faq/index.md index 840e07c06e7..b473e48f08f 100644 --- a/docs/cloud/manage/jan2025_faq/index.md +++ b/docs/cloud/manage/jan2025_faq/index.md @@ -5,9 +5,9 @@ description: 'Index page for new pricing FAQ' keywords: ['new pricing', 'faq'] --- - diff --git a/docs/cloud/manage/jan2025_faq/new_tiers.md b/docs/cloud/manage/jan2025_faq/new_tiers.md index f921b7942c9..e1b2d72fb82 100644 --- a/docs/cloud/manage/jan2025_faq/new_tiers.md +++ b/docs/cloud/manage/jan2025_faq/new_tiers.md @@ -14,7 +14,6 @@ description: 'Description of new tiers and features' - **Enhanced Encryption:** This feature is available in Enterprise tier services, including for single replica services, in AWS and GCP. Services are encrypted by our key by default and can be rotated to their key to enable Customer Managed Encryption Keys (CMEK). - **Single Sign On (SSO):** This feature is offered in Enterprise tier and requires a support ticket to be enabled for an Organization. Users who have multiple Organizations should ensure all of their organizations are on the Enterprise tier to use SSO for each organization. - ## Basic tier {#basic-tier} ### What are the considerations for the Basic tier? {#what-are-the-considerations-for-the-basic-tier} @@ -41,7 +40,7 @@ No, services on this tier are meant to support workloads that are small and fixe ### Which tiers on the new plans (Basic/Scale/Enterprise) support compute-compute separation? {#which-tiers-on-the-new-plans-basicscaleenterprise-support-compute-compute-separation} -Only Scale and Enterprise tiers support compute-compute separation. Please also note that this capability requires running at least a 2+ replica parent service. +Only Scale and Enterprise tiers support compute-compute separation. Please also note that this capability requires running at least a 2+ replica parent service. ### Can users on the legacy plans (Production/Development) access compute-compute separation? {#can-users-on-the-legacy-plans-productiondevelopment-access-compute-compute-separation} @@ -51,7 +50,7 @@ Compute-compute separation is not supported on existing Development and Producti ### What different hardware profiles are supported for the Enterprise tier? {#what-different-hardware-profiles-are-supported-for-the-enterprise-tier} -The enterprise tier will support standard profiles (1:4 vCPU:memory ratio), as well as `highMem (1:8 ratio)` and `highCPU (1:2 ratio)` **custom profiles,** offering users more flexibility to select the configuration that best suits their needs. The Enterprise Tier will use shared compute resources deployed alongside the Basic and Scale tiers. +The enterprise tier will support standard profiles (1:4 vCPU:memory ratio), as well as `highMem (1:8 ratio)` and `highCPU (1:2 ratio)` **custom profiles,** offering users more flexibility to select the configuration that best suits their needs. The Enterprise Tier will use shared compute resources deployed alongside the Basic and Scale tiers. ### What are the features exclusively offered on the Enterprise tier? {#what-are-the-features-exclusively-offered-on-the-enterprise-tier} @@ -59,9 +58,7 @@ The enterprise tier will support standard profiles (1:4 vCPU:memory ratio), as w - **Enterprise-grade security:** - **Single Sign On (SSO**) - **Enhanced Encryption:** For AWS and GCP services. 
Services are encrypted by our key by default and can be rotated to their key to enable Customer Managed Encryption Keys (CMEK). -- **Scheduled upgrades:** Users can select the day of the week/time window for upgrades, both database and cloud releases. +- **Scheduled upgrades:** Users can select the day of the week/time window for upgrades, both database and cloud releases. - **HIPAA Compliance:** The customer must sign a Business Associate Agreement (BAA) through Legal before we enable HIPAA-compliant regions for them. - **Private Regions:** It is not self-serve enabled and will need users to route requests through Sales sales@clickhouse.com. - **Export Backups** to the customer's cloud account. - - diff --git a/docs/cloud/manage/jan2025_faq/plan_migrations.md b/docs/cloud/manage/jan2025_faq/plan_migrations.md index 1e839228614..49bdcec6992 100644 --- a/docs/cloud/manage/jan2025_faq/plan_migrations.md +++ b/docs/cloud/manage/jan2025_faq/plan_migrations.md @@ -56,19 +56,17 @@ Please note that as part of this change, historical system table data will be re ### How will users be guided during migration, understanding what tier best fits their needs? {#how-will-users-be-guided-during-migration-understanding-what-tier-best-fits-their-needs} -The console will prompt you with recommended options for each service based on historical use if you have a service. New users can review the capabilities and features listed in detail and decide on the tier that best suits their needs. +The console will prompt you with recommended options for each service based on historical use if you have a service. New users can review the capabilities and features listed in detail and decide on the tier that best suits their needs. ### How do users size and estimate the cost of "warehouses" in the new pricing? {#how-do-users-size-and-estimate-the-cost-of-warehouses-in-the-new-pricing} Please refer to the pricing calculator on the [Pricing](https://clickhouse.com/pricing) page, which will help estimate the cost based on your workload size and tier selection. - ## Undertaking the migration {#undertaking-the-migration} - ### What are service version pre-requisites to undertaking the migration? {#what-are-service-version-pre-requisites-to-undertaking-the-migration} -Your service has to be on version 24.8 or later and already migrated to SharedMergeTree. +Your service has to be on version 24.8 or later and already migrated to SharedMergeTree. ### What is the migration experience for users of the current Development and Production services? Do users need to plan for a maintenance window where the service is unavailable? {#what-is-the-migration-experience-for-users-of-the-current-development-and-production-services-do-users-need-to-plan-for-a-maintenance-window-where-the-service-is-unavailable} @@ -80,11 +78,11 @@ API access patterns will be different. Users that use our OpenAPI to create new services will be required to remove the `tier` field in the service creation `POST` request. -The `tier` field has been removed from the service object as we no longer have service tiers. +The `tier` field has been removed from the service object as we no longer have service tiers. This will affect the objects returned by the `POST`, `GET`, and `PATCH` service requests. Therefore, any code that consumes these APIs may need to be adjusted to handle these changes. The number of replicas each service will be created with defaults to 3 for the Scale and Enterprise tiers, while it defaults to 1 for the Basic tier. 
-For the Scale and the Enterprise tiers it is possible to adjust it by passing a `numReplicas` field in the service creation request. +For the Scale and the Enterprise tiers it is possible to adjust it by passing a `numReplicas` field in the service creation request. The value of the `numReplicas` field must be between 2 and 20 for the first service in a warehouse. Services that are created in an existing warehouse can have a number of replicas as low as 1. ### What changes should the users make if using the existing Terraform provider for automation? {#what-changes-should-the-users-make-if-using-the-existing-terraform-provider-for-automation} @@ -98,7 +96,7 @@ After the migration, the `tier` field is no longer accepted, and references to i Users will also be able to specify the `num_replicas` field as a property of the service resource. The number of replicas each service will be created with defaults to 3 for the Scale and Enterprise tiers, while it defaults to 1 for the Basic tier. -For the Scale and the Enterprise tiers, it is possible to adjust it by passing a `numReplicas` field in the service creation request. +For the Scale and the Enterprise tiers, it is possible to adjust it by passing a `numReplicas` field in the service creation request. The value of the `num_replicas` filed must be between 2 and 20 for the first service in a warehouse. Services that are created in an existing warehouse can have a number of replicas as low as 1. ### Will users have to make any changes to the database access? {#will-users-have-to-make-any-changes-to-the-database-access} diff --git a/docs/cloud/manage/jan2025_faq/scaling.md b/docs/cloud/manage/jan2025_faq/scaling.md index 4fa38ed113b..41fc0f97eb8 100644 --- a/docs/cloud/manage/jan2025_faq/scaling.md +++ b/docs/cloud/manage/jan2025_faq/scaling.md @@ -12,18 +12,18 @@ ClickHouse Cloud allows scaling in both directions - vertical (increasing replic The scaling behavior per tier is as follows: * **Basic**: Basic tier supports only single replica services. These services are meant to be fixed in size and do not allow vertical or horizontal scaling. Users can upgrade to the Scale or Enterprise tier to scale their services. -* **Scale**: Scale tier supports single and multi-replica services. Scaling will be permitted for Multi-replica services. +* **Scale**: Scale tier supports single and multi-replica services. Scaling will be permitted for Multi-replica services. * Services can vertically scale to the maximum replica size supported for a CSP/region AFTER they have scaled to a multi-replica setup; only 2+ replicas can be vertically scaled. - * Manual horizontal scaling will be available. -* **Enterprise**: Enterprise tier supports single and multi-replica services, and scaling will be permitted for Multi-replica services + * Manual horizontal scaling will be available. +* **Enterprise**: Enterprise tier supports single and multi-replica services, and scaling will be permitted for Multi-replica services * Services can vertically scale to maximum replica sizes supported for a CSP/region. * Standard profiles (1:4 CPU to memory ratio) will support vertical auto-scaling * Custom profiles (`highMemory` and `highCPU`) can be scaled vertically through a support ticket. * Manual horizontal scaling will be available. -:::note -Services can scale horizontally to a maximum of 20 replicas. If you need additional replicas, please contact our support team. -::: + :::note + Services can scale horizontally to a maximum of 20 replicas. 
If you need additional replicas, please contact our support team. + ::: ## Can users scale in their service? {#can-users-scale-in-their-service} @@ -34,4 +34,3 @@ Scaling in will be restricted to 2+ replicas. Once scaled out, users will not be We are introducing a new vertical scaling mechanism for compute replicas, which we call "Make Before Break" (MBB). This approach adds one or more replicas of the new size before removing the old replicas, preventing any loss of capacity during scaling operations. By eliminating the gap between removing existing replicas and adding new ones, MBB creates a more seamless and less disruptive scaling process. It is especially beneficial in scale-up scenarios, where high resource utilization triggers the need for additional capacity, since removing replicas prematurely would only exacerbate the resource constraints. Please note that as part of this change, historical system table data will be retained for up to a maximum of 30 days as part of scaling events. In addition, any system table data older than December 19, 2024, for services on AWS or GCP and older than January 14, 2025, for services on Azure will not be retained as part of the migration to the new organization tiers. - diff --git a/docs/cloud/manage/jan2025_faq/summary.md b/docs/cloud/manage/jan2025_faq/summary.md index dfeafe642d3..da2ce49d599 100644 --- a/docs/cloud/manage/jan2025_faq/summary.md +++ b/docs/cloud/manage/jan2025_faq/summary.md @@ -9,83 +9,83 @@ The following FAQ summarizes common questions with respect to new tiers introduc ## What has changed with ClickHouse Cloud tiers? {#what-has-changed-with-clickhouse-cloud-tiers} -At ClickHouse, we are dedicated to adapting our products to meet the ever-changing requirements of our customers. Since its introduction in GA over the past two years, ClickHouse Cloud has evolved substantially, and we've gained invaluable insights into how our customers leverage our cloud offerings. +At ClickHouse, we are dedicated to adapting our products to meet the ever-changing requirements of our customers. Since its introduction in GA over the past two years, ClickHouse Cloud has evolved substantially, and we've gained invaluable insights into how our customers leverage our cloud offerings. We are introducing new features to optimize the sizing and cost-efficiency of ClickHouse Cloud services for your workloads. These include compute-compute separation, high-performance machine types, and single-replica services. We are also evolving automatic scaling and managed upgrades to execute in a more seamless and reactive fashion. -We are adding a new Enterprise tier to serve the needs of the most demanding customers and workloads, with focus on industry-specific security and compliance features, even more controls over underlying hardware and upgrades, and advanced disaster recovery features. +We are adding a new Enterprise tier to serve the needs of the most demanding customers and workloads, with focus on industry-specific security and compliance features, even more controls over underlying hardware and upgrades, and advanced disaster recovery features. -You can read about these and other functional changes in this [blog](https://clickhouse.com/blog/evolution-of-clickhouse-cloud-new-features-superior-performance-tailored-offerings). +You can read about these and other functional changes in this [blog](https://clickhouse.com/blog/evolution-of-clickhouse-cloud-new-features-superior-performance-tailored-offerings). ## What action is required? 
{#what-action-is-required} -To support these changes, we are restructuring our current tiers to more closely match how our evolving customer base is using our offerings, and you need to take action to select a new plan. +To support these changes, we are restructuring our current tiers to more closely match how our evolving customer base is using our offerings, and you need to take action to select a new plan. -Details and timelines for making these selections are described below. +Details and timelines for making these selections are described below. ## How are tiers changing? {#how-are-tiers-changing} -We are transitioning from a model that organizes paid tiers purely by "service types" which are delineated by both capacity and features (namely, these are Development, Production, and Dedicated tiers) to one that organizes paid tiers by feature availability. These new tiers are called Basic, Scale, and Enterprise and are described in more detail below. +We are transitioning from a model that organizes paid tiers purely by "service types" which are delineated by both capacity and features (namely, these are Development, Production, and Dedicated tiers) to one that organizes paid tiers by feature availability. These new tiers are called Basic, Scale, and Enterprise and are described in more detail below. This change brings several key benefits: * **Consistent Feature Access**: Features present in a tier will be available in that tier for all sizes of services, as well as in all tiers above it. For example, private networking, previously available only for Production service types, will now be accessible for all services starting with the Scale tier, so you can deploy it for services sized both for development and production workloads as you see fit. -* **Organizational-Level Features**: We can now provide features built at an organizational level with the appropriate plan, ensuring that customers receive the tools they need at the right level of service. For example, access to SSO (single-sign-on) and CMEK (customer-managed encryption keys) will be available at the Enterprise tier. +* **Organizational-Level Features**: We can now provide features built at an organizational level with the appropriate plan, ensuring that customers receive the tools they need at the right level of service. For example, access to SSO (single-sign-on) and CMEK (customer-managed encryption keys) will be available at the Enterprise tier. * **Optimized Support Plans**: The new packaging structure also allows us to align support response times with paid tiers, which more effectively meet the needs of our diverse customer base. For example, we are now making named support engineers available to our Enterprise tier customers. -Below we provide an overview of the new tiers, describe how they relate to use cases, and outline key features. + Below we provide an overview of the new tiers, describe how they relate to use cases, and outline key features. -**Basic: A taste of ClickHouse** + **Basic: A taste of ClickHouse** * Basic tier is designed to offer a budget-friendly option for organizations with smaller data volumes and less demanding workloads. It allows you to run single-replica deployments with up to 12GB of memory and less than 1TB of storage and is ideal for small-scale use cases that do not require reliability guarantees. 
-**Scale: Enhanced SLAs and scalability** + **Scale: Enhanced SLAs and scalability** * Scale tier is suitable for workloads that require enhanced SLAs, greater scalability, and advanced security measures. * It offers unlimited compute and storage with any replication factor, access to compute-compute separation, and automatic vertical and horizontal scaling. * Key features include: - * Support for private networking, customized backup controls, multi-factor auth, and more - * Compute-compute separation for optimized resource usage - * Flexible scaling options (both vertical and horizontal) to meet changing demands + * Support for private networking, customized backup controls, multi-factor auth, and more + * Compute-compute separation for optimized resource usage + * Flexible scaling options (both vertical and horizontal) to meet changing demands -**Enterprise: Mission-critical deployments** + **Enterprise: Mission-critical deployments** -* Enterprise tier is the best place to run large-scale, mission-critical ClickHouse deployments. +* Enterprise tier is the best place to run large-scale, mission-critical ClickHouse deployments. * It is best suited for organizations with stringent security and compliance needs, requiring the highest levels of performance and reliability. * Key features include: - * Industry-specific compliance certifications, such as HIPAA - * Self-service access to SSO (Single Sign-On) and CMEK (Customer Managed Encryption Keys) - * Scheduled upgrades to ensure minimal disruption - * Support for custom configurations, including high-memory, high-CPU options, and private regions + * Industry-specific compliance certifications, such as HIPAA + * Self-service access to SSO (Single Sign-On) and CMEK (Customer Managed Encryption Keys) + * Scheduled upgrades to ensure minimal disruption + * Support for custom configurations, including high-memory, high-CPU options, and private regions -New tiers are described in more detail on our [website](https://clickhouse.com/pricing). + New tiers are described in more detail on our [website](https://clickhouse.com/pricing). ## How is pricing changing? {#how-is-pricing-changing} In addition to evolving our paid tiers, we are making the following adjustments to our overall pricing structure and price points: -* **Storage**: Storage price per TB will be reduced and will no longer bundle backups in the storage cost. +* **Storage**: Storage price per TB will be reduced and will no longer bundle backups in the storage cost. * **Backups**: Backups will be charged separately, with only one backup being mandatory. -* **Compute**: Compute costs will increase, varying by tier and region. This increase may be balanced by the introduction of compute-compute separation and single-replica services, which allow you to optimize compute usage by deploying and right-sizing services tailored to different workload types. -* **Data Transfer**: We are introducing charges for data egress, specifically for data transfer over the internet and cross region. Based on our analysis, most customers will not see a substantial increase in their monthly bill based on this new dimension. -* **ClickPipes**: Our managed ingest service, which was offered for free during the introductory period, will now incur charges based on compute and ingested data. Based on our analysis, most customers will not see a substantial increase in their monthly bill based on this new dimension. +* **Compute**: Compute costs will increase, varying by tier and region. 
This increase may be balanced by the introduction of compute-compute separation and single-replica services, which allow you to optimize compute usage by deploying and right-sizing services tailored to different workload types. +* **Data Transfer**: We are introducing charges for data egress, specifically for data transfer over the internet and cross region. Based on our analysis, most customers will not see a substantial increase in their monthly bill based on this new dimension. +* **ClickPipes**: Our managed ingest service, which was offered for free during the introductory period, will now incur charges based on compute and ingested data. Based on our analysis, most customers will not see a substantial increase in their monthly bill based on this new dimension. ## When will these changes take effect? {#when-will-these-changes-take-effect} -While changes are effective immediately for new customers, existing customers will have from 6 months to a year to transition to new plans. +While changes are effective immediately for new customers, existing customers will have from 6 months to a year to transition to new plans. Detailed breakdown of effective dates is below: -* **New Customers**: The new plans will take effect on **January 27, 2025** for new customers of ClickHouse Cloud. +* **New Customers**: The new plans will take effect on **January 27, 2025** for new customers of ClickHouse Cloud. * **Existing PAYG Customers**: Pay-as-you-go (PAYG) customers will have 6 months until **July 23, 2025** to migrate to new plans. * **Existing Committed Spend Customers**: Customers with committed spend agreements can renegotiate their terms at the end of their current contract. -* **New usage dimensions** for Data Transfer and ClickPipes are effective for both PAYG and Committed Spend customers 8 weeks following this announcement on **March 24, 2025**. +* **New usage dimensions** for Data Transfer and ClickPipes are effective for both PAYG and Committed Spend customers 8 weeks following this announcement on **March 24, 2025**. ## What actions should you take? {#what-actions-should-you-take} -If you are a **pay-as-you-go (PAYG) customer**, you can migrate to a new plan through the self-service options available in your ClickHouse Cloud console. +If you are a **pay-as-you-go (PAYG) customer**, you can migrate to a new plan through the self-service options available in your ClickHouse Cloud console. If you are a **committed spend customer**, please reach out to your account representative to discuss your custom migration plan and timeline. diff --git a/docs/cloud/manage/network-data-transfer.mdx b/docs/cloud/manage/network-data-transfer.mdx index f200ce28362..44f2c994103 100644 --- a/docs/cloud/manage/network-data-transfer.mdx +++ b/docs/cloud/manage/network-data-transfer.mdx @@ -7,32 +7,31 @@ description: 'Learn more about how ClickHouse Cloud meters data transferred ingr import NetworkPricing from '@site/docs/cloud/manage/_snippets/_network_transfer_rates.md'; -ClickHouse Cloud meters data transferred ingress and egress. -This includes any data in and out of ClickHouse Cloud as well as any intra-region and cross-region data transfer. +ClickHouse Cloud meters data transferred ingress and egress. +This includes any data in and out of ClickHouse Cloud as well as any intra-region and cross-region data transfer. This usage is tracked at the service level. Based on this usage, customers incur data transfer charges that are then added to their monthly bill. 
ClickHouse Cloud charges for: -- Data egress from ClickHouse Cloud to the public Internet, including to other regions of other cloud providers. +- Data egress from ClickHouse Cloud to the public Internet, including to other regions of other cloud providers. - Data egress to another region in the same cloud provider. -There are no charges for intra-region data transfer or Private Link/Private Service Connect use and data transfer. -However, we reserve the right to implement additional data transfer pricing dimensions if we see usage patterns that impact our ability to charge users appropriately. + There are no charges for intra-region data transfer or Private Link/Private Service Connect use and data transfer. + However, we reserve the right to implement additional data transfer pricing dimensions if we see usage patterns that impact our ability to charge users appropriately. -Data transfer charges vary by Cloud Service Provider (CSP) and region. -Public internet egress pricing is based only on the origin region. -Inter-region (or cross-region) pricing depends on both the origin and destination regions. + Data transfer charges vary by Cloud Service Provider (CSP) and region. + Public internet egress pricing is based only on the origin region. + Inter-region (or cross-region) pricing depends on both the origin and destination regions. -**Best Practices to minimize Data Transfer Costs** + **Best Practices to minimize Data Transfer Costs** -There are some patterns to keep in mind when ingressing and egressing data to minimize data transfer costs. + There are some patterns to keep in mind when ingressing and egressing data to minimize data transfer costs. 1. When ingressing or egressing data from Clickhouse Cloud, use compression where possible, to minimize the amount of data transferred and the associated cost. 2. Be aware that when doing an INSERT over the native protocol with non-inlined values (e.g. INSERT INTO [TABLE] FROM INFILE [FILE] FORMAT NATIVE), ClickHouse clients pull metadata from servers to pack the data. If the metadata is larger than the INSERT payload, you might counterintuitively see more egress than there is ingress from the server perspective. If this is unacceptable, consider inlining data with VALUES syntax or using the HTTP protocol. + The tables below shows how data transfer charges for egress vary across public internet or cross-region by cloud provider and region. -The tables below shows how data transfer charges for egress vary across public internet or cross-region by cloud provider and region. + :::note + ClickHouse Cloud meters inter-region usage in terms of tiers, Tier 1 through Tier 4, depending on the origin and destination regions. The table below shows the tier for each combination of inter-region data transfer. In the Billing usage screen on ClickHouse Cloud you will see data transfer usage broken out by tiers. + ::: -:::note -ClickHouse Cloud meters inter-region usage in terms of tiers, Tier 1 through Tier 4, depending on the origin and destination regions. The table below shows the tier for each combination of inter-region data transfer. In the Billing usage screen on ClickHouse Cloud you will see data transfer usage broken out by tiers. -::: - - + diff --git a/docs/cloud/manage/notifications.md b/docs/cloud/manage/notifications.md index 708c41b2274..84e619082e3 100644 --- a/docs/cloud/manage/notifications.md +++ b/docs/cloud/manage/notifications.md @@ -23,10 +23,8 @@ Notifications can be received via various channels. 
For now, ClickHouse Cloud su ClickHouse Cloud notifications flyout - ClickHouse Cloud notifications activity log - ## Customizing notifications {#customizing-notifications} For each notification, you can customize how you receive the notification. You can access the settings screen from the notifications flyout or from the second tab on the notifications activity log. diff --git a/docs/cloud/manage/openapi.md b/docs/cloud/manage/openapi.md index 919cb38cc48..89878121892 100644 --- a/docs/cloud/manage/openapi.md +++ b/docs/cloud/manage/openapi.md @@ -22,42 +22,40 @@ This document covers the ClickHouse Cloud API. For database API endpoints, pleas 1. You can use the **API Keys** tab on the left menu to create and manage your API keys. - API Keys tab + API Keys tab 2. The **API Keys** page will initially display a prompt to create your first API key as shown below. After your first key is created, you can create new keys using the `New API Key` button that appears in the top right corner. - API Keys page - + API Keys page + 3. To create an API key, specify the key name, permissions for the key, and expiration time, then click `Generate API Key`. -
-:::note -Permissions align with ClickHouse Cloud [predefined roles](/cloud/security/cloud-access-management/overview#console-users-and-roles). The developer role has read-only permissions for assigned services and the admin role has full read and write permissions. -::: + :::note + Permissions align with ClickHouse Cloud [predefined roles](/cloud/security/cloud-access-management/overview#console-users-and-roles). The developer role has read-only permissions for assigned services and the admin role has full read and write permissions. + ::: - Create API key form + Create API key form 4. The next screen will display your Key ID and Key secret. Copy these values and put them somewhere safe, such as a vault. The values will not be displayed after you leave this screen. - API key details + API key details 5. The ClickHouse Cloud API uses [HTTP Basic Authentication](https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication) to verify the validity of your API keys. Here is an example of how to use your API keys to send requests to the ClickHouse Cloud API using `curl`: -```bash -$ KEY_ID=mykeyid -$ KEY_SECRET=mykeysecret + ```bash + $ KEY_ID=mykeyid + $ KEY_SECRET=mykeysecret -$ curl --user $KEY_ID:$KEY_SECRET https://api.clickhouse.cloud/v1/organizations -``` + $ curl --user $KEY_ID:$KEY_SECRET https://api.clickhouse.cloud/v1/organizations + ``` 6. Returning to the **API Keys** page, you will see the key name, last four characters of the Key ID, permissions, status, expiration date, and creator. You are able to edit the key name, permissions, and expiration from this screen. Keys may also be disabled or deleted form this screen. -
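Building on the `curl` example above, the same Basic Auth credentials work against the other endpoints listed in the API reference. As an illustrative sketch (assuming the services endpoint documented in the API reference, with a placeholder organization ID), listing the services in an organization looks like this:

```bash
# Sketch only: list the services in one organization.
# ORG_ID is a placeholder; take a real ID from the /v1/organizations response above.
KEY_ID=mykeyid
KEY_SECRET=mykeysecret
ORG_ID=your-organization-id

curl --user "$KEY_ID:$KEY_SECRET" \
  "https://api.clickhouse.cloud/v1/organizations/$ORG_ID/services"
```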
-:::note -Deleting an API key is a permanent action. Any services using the key will immediately lose access to ClickHouse Cloud. -::: + :::note + Deleting an API key is a permanent action. Any services using the key will immediately lose access to ClickHouse Cloud. + ::: - API Keys management page + API Keys management page ## Endpoints {#endpoints} -Refer details on endpoints, refer to the [API reference](https://clickhouse.com/docs/cloud/manage/api/swagger). +Refer details on endpoints, refer to the [API reference](https://clickhouse.com/docs/cloud/manage/api/swagger). Use your API Key and API Secret with the base URL `https://api.clickhouse.cloud/v1`. diff --git a/docs/cloud/manage/postman.md b/docs/cloud/manage/postman.md index d1917939568..86ce914d9a6 100644 --- a/docs/cloud/manage/postman.md +++ b/docs/cloud/manage/postman.md @@ -24,56 +24,56 @@ import postman15 from '@site/static/images/cloud/manage/postman/postman15.png'; import postman16 from '@site/static/images/cloud/manage/postman/postman16.png'; import postman17 from '@site/static/images/cloud/manage/postman/postman17.png'; -This guide will help you test the ClickHouse Cloud API using [Postman](https://www.postman.com/product/what-is-postman/). +This guide will help you test the ClickHouse Cloud API using [Postman](https://www.postman.com/product/what-is-postman/). The Postman Application is available for use within a web browser or can be downloaded to a desktop. ### Create an account {#create-an-account} * Free accounts are available at [https://www.postman.com](https://www.postman.com). -Postman site + Postman site ### Create a workspace {#create-a-workspace} -* Name your workspace and set the visibility level. +* Name your workspace and set the visibility level. -Create workspace + Create workspace ### Create a collection {#create-a-collection} -* Below "Explore" on the top left Menu click "Import": +* Below "Explore" on the top left Menu click "Import": -Explore > Import + Explore > Import * A modal will appear: -API URL entry + API URL entry * Enter the API address: "https://api.clickhouse.cloud/v1" and press 'Enter': -Import + Import * Select "Postman Collection" by clicking on the "Import" button: -Collection > Import + Collection > Import ### Interface with the ClickHouse Cloud API spec {#interface-with-the-clickhouse-cloud-api-spec} * The "API spec for ClickHouse Cloud" will now appear within "Collections" (Left Navigation). -Import your API + Import your API * Click on "API spec for ClickHouse Cloud." 
From the middle pain select the 'Authorization' tab: -Import complete + Import complete ### Set authorization {#set-authorization} * Toggle the dropdown menu to select "Basic Auth": -Basic auth + Basic auth * Enter the Username and Password received when you set up your ClickHouse Cloud API keys: -credentials + credentials ### Enable variables {#enable-variables} @@ -84,8 +84,7 @@ The Postman Application is available for use within a web browser or can be down * Within the "Collection", click the "Variable" tab in the middle pane (The Base URL will have been set by the earlier API import): * Below `baseURL` click the open field "Add new value", and Substitute your organization ID and service ID: -Organization ID and Service ID - + Organization ID and Service ID ## Test the ClickHouse Cloud API functionalities {#test-the-clickhouse-cloud-api-functionalities} @@ -94,26 +93,26 @@ The Postman Application is available for use within a web browser or can be down * Under the "OpenAPI spec for ClickHouse Cloud", expand the folder > V1 > organizations * Click "GET list of available organizations" and press the blue "Send" button on the right: -Test retrieval of organizations + Test retrieval of organizations * The returned results should deliver your organization details with "status": 200. (If you receive a "status" 400 with no organization information your configuration is not correct). -Status + Status ### Test "GET organizational details" {#test-get-organizational-details} * Under the `organizationid` folder, navigate to "GET organizational details": * In the middle frame menu under Params an `organizationid` is required. -Test retrieval of organization details + Test retrieval of organization details * Edit this value with `orgid` in curly braces `{{orgid}}` (From setting this value earlier a menu will appear with the value): -Submit test + Submit test * After pressing the "Save" button, press the blue "Send" button at the top right of the screen. -Return value + Return value * The returned results should deliver your organization details with "status": 200. (If you receive a "status" 400 with no organization information your configuration is not correct). @@ -123,7 +122,6 @@ The Postman Application is available for use within a web browser or can be down * Edit the Values for `organizationid` and `serviceid` with `{{orgid}}` and `{{serviceid}}` respectively. * Press "Save" and then the blue "Send" button on the right. -List of services + List of services * The returned results should deliver a list of your services and their details with "status": 200. (If you receive a "status" 400 with no service(s) information your configuration is not correct). - diff --git a/docs/cloud/manage/scaling.md b/docs/cloud/manage/scaling.md index c5e50664609..243e9b61a29 100644 --- a/docs/cloud/manage/scaling.md +++ b/docs/cloud/manage/scaling.md @@ -29,14 +29,14 @@ For Enterprise tier services scaling works as follows: - **Horizontal scaling**: Manual horizontal scaling will be available across all standard and custom profiles on the enterprise tier. - **Vertical scaling**: - - Standard profiles (1:4) will support vertical autoscaling. - - Custom profiles will not support vertical autoscaling or manual vertical scaling at launch. However, these services can be scaled vertically by contacting support. + - Standard profiles (1:4) will support vertical autoscaling. + - Custom profiles will not support vertical autoscaling or manual vertical scaling at launch. 
However, these services can be scaled vertically by contacting support. -:::note -Scaling in ClickHouse Cloud happens in what we call "Make Before Break" (MBB) approach. This adds one or more replicas of the new size before removing the old replicas, preventing any loss of capacity during scaling operations. By eliminating the gap between removing existing replicas and adding new ones, MBB creates a more seamless and less disruptive scaling process. It is especially beneficial in scale-up scenarios, where high resource utilization triggers the need for additional capacity, since removing replicas prematurely would only exacerbate the resource constraints. As part of this approach we wait up to an hour to let any existing queries complete on the older replicas before we will remove them. This balances the need for existing queries to complete, while at the same time ensuring that older replicas do not linger around for too long. + :::note + We are introducing a new vertical scaling mechanism for compute replicas, which we call "Make Before Break" (MBB). This approach adds one or more replicas of the new size before removing the old replicas, preventing any loss of capacity during scaling operations. By eliminating the gap between removing existing replicas and adding new ones, MBB creates a more seamless and less disruptive scaling process. It is especially beneficial in scale-up scenarios, where high resource utilization triggers the need for additional capacity, since removing replicas prematurely would only exacerbate the resource constraints. -Please note that as part of this change, historical system table data will be retained for up to a maximum of 30 days as part of scaling events. In addition, any system table data older than December 19, 2024, for services on AWS or GCP and older than January 14, 2025, for services on Azure will not be retained as part of the migration to the new organization tiers. -::: + Please note that as part of this change, historical system table data will be retained for up to a maximum of 30 days as part of scaling events. In addition, any system table data older than December 19, 2024, for services on AWS or GCP and older than January 14, 2025, for services on Azure will not be retained as part of the migration to the new organization tiers. + ::: ### Vertical auto scaling {#vertical-auto-scaling} @@ -126,9 +126,9 @@ Use automatic idling only if your use case can handle a delay before responding ## Handling spikes in workload {#handling-bursty-workloads} If you have an upcoming expected spike in your workload, you can use the -[ClickHouse Cloud API](/cloud/manage/api/api-overview) to +[ClickHouse Cloud API](/cloud/manage/api/api-overview) to preemptively scale up your service to handle the spike and scale it down once -the demand subsides. +the demand subsides. To understand the current CPU cores and memory in use for each of your replicas, you can run the query below: diff --git a/docs/cloud/manage/upgrades.md b/docs/cloud/manage/upgrades.md index 82da53cf7b4..36dd5487211 100644 --- a/docs/cloud/manage/upgrades.md +++ b/docs/cloud/manage/upgrades.md @@ -46,9 +46,9 @@ The three release channels are: - The [**regular release channel**](#regular-release-channel) is the default, and upgrades on this channel start two weeks after the fast release channel upgrades. If your service on the Scale and Enterprise tier does not have a release channel set, it is on the regular release channel by default. 
- The [**slow release channel**](#slow-release-channel-deferred-upgrades) is for deferred release. Upgrades on this channel occur two weeks after the regular release channel upgrades. -:::note -Basic tier services are automatically enlisted to the fast release channel -::: + :::note + Basic tier services are automatically enlisted to the fast release channel + ::: ### Fast release channel (early upgrades) {#fast-release-channel-early-upgrades} @@ -61,19 +61,17 @@ Specifically, services will: - Receive the latest ClickHouse releases - More frequent upgrades as new releases are tested -You can modify the release schedule of the service in the Cloud console as shown below: + You can modify the release schedule of the service in the Cloud console as shown below: -
Select Plan

Select Plan
-This **Fast release** channel is suitable for testing new features in non-critical environments. **It is not recommended for production workloads with strict uptime and reliability requirements.** + This **Fast release** channel is suitable for testing new features in non-critical environments. **It is not recommended for production workloads with strict uptime and reliability requirements.** ### Regular release channel {#regular-release-channel} diff --git a/docs/cloud/migrate/upload-a-csv-file.md b/docs/cloud/migrate/upload-a-csv-file.md index 71347b0f55e..a90411c4c43 100644 --- a/docs/cloud/migrate/upload-a-csv-file.md +++ b/docs/cloud/migrate/upload-a-csv-file.md @@ -32,82 +32,52 @@ following formats: | `TabSeparatedWithNamesAndTypes` | - ## Upload a file {#upload-file} - From the Cloud homepage, select your service as shown below: - upload_file_02 - If your service is idle you will need to wake it. - Select `Data sources` in the left hand tab as shown below: - upload_file_03 - Next select `Upload a file` on the right side of the data sources page: - upload_file_04 - -A file dialogue will pop up allowing you to select the file that you wish to +A file dialogue will pop up allowing you to select the file that you wish to use to insert data into a table on your Cloud service. - upload_file_05 - ## Configure table {#configure-table} - -Once the file has uploaded you will be able to configure the table where you want +Once the file has uploaded you will be able to configure the table where you want to insert the data to. A preview of the table with the first three rows is shown. - upload_file_08 - You can now select a destination table. The options are: - - a new table - an existing table - -
-You can specify which database you want to upload the data to, and in the case of +You can specify which database you want to upload the data to, and in the case of a new table, the name of the table that will be created. You will also be able to select the sorting key: - upload_file_05 - Columns read from the file are shown as `Source field`s and for each field, you can change: - the inferred type - the default value - whether to make the column [Nullable](/sql-reference/data-types/nullable) or not - upload_file_06 - :::note Excluding fields You can also remove a field if you don't want to include it in the import ::: - You can specify the type of table engine that you want to use: - - `MergeTree` - `ReplacingMergeTree` - `SummingMergeTree` - `Null` -
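These choices correspond to an ordinary `CREATE TABLE` statement. As a rough sketch (the table and column names here are invented for illustration), picking the `MergeTree` engine with a single sorting-key column might translate to:

```sql
-- Illustrative only: the import dialog generates the equivalent DDL for you.
-- The table and column names below are hypothetical.
CREATE TABLE trips
(
    trip_id     UInt64,
    pickup_date Date,
    fare        Nullable(Float64)   -- a column marked as Nullable in the dialog
)
ENGINE = MergeTree
ORDER BY pickup_date;
```

The partitioning and primary key expressions described next would appear as additional `PARTITION BY` and `PRIMARY KEY` clauses.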
You can specify a partitioning key expression and primary key expression. - upload_file_07 - Click `Import to ClickHouse` (shown above) to import the data. The data import will be queued as indicated by the `queued` status badge in the `Status` column as shown below. You can also click `Open as query` (shown above) to open the insert query in the SQL console. The query will insert the file which was uploaded to an S3 bucket using the `URL` table function. - upload_file_09 - -If the job fails you will see a `failed` status badge under the `Status` column of -the `Data upload history` tab. You can click `View Details` for more information +If the job fails you will see a `failed` status badge under the `Status` column of +the `Data upload history` tab. You can click `View Details` for more information on why the upload failed. You may need to modify the table configuration or clean the data based on the error message for the failed insert. - upload_file_11 - -
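For reference, the statement behind `Open as query` has roughly the following shape — the table name, bucket URL, and format below are placeholders rather than the exact values the console generates:

```sql
-- Illustrative only: the console fills in the real S3 URL, column list, and format.
INSERT INTO trips
SELECT *
FROM url('https://<bucket>.s3.amazonaws.com/<uploaded-file>.csv', 'CSVWithNames');
```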
\ No newline at end of file + diff --git a/docs/cloud/reference/byoc.md b/docs/cloud/reference/byoc.md index d59ae44c00f..bcaa85f1286 100644 --- a/docs/cloud/reference/byoc.md +++ b/docs/cloud/reference/byoc.md @@ -28,7 +28,7 @@ BYOC (Bring Your Own Cloud) allows you to deploy ClickHouse Cloud on your own cl BYOC is currently only supported for AWS. You can join the wait list for GCP and Azure [here](https://clickhouse.com/cloud/bring-your-own-cloud). -:::note +:::note BYOC is designed specifically for large-scale deployments, and requires customers to sign a committed contract. ::: @@ -95,20 +95,12 @@ By default, ClickHouse Cloud will provision a dedicated VPC for better isolation 2. Ensure each subnet has a minimum CIDR range of `/23` (e.g., 10.0.0.0/23) to provide sufficient IP addresses for the ClickHouse deployment. 3. Add the tag `kubernetes.io/role/internal-elb=1` to each subnet to enable proper load balancer configuration. -
BYOC VPC Subnet

BYOC VPC Subnet Tags
+ BYOC VPC Subnet Tags -**Contact ClickHouse Support** -Create a support ticket with the following information: + **Contact ClickHouse Support** + Create a support ticket with the following information: * Your AWS account ID * The AWS region where you want to deploy the service @@ -116,7 +108,6 @@ Create a support ticket with the following information: * The Private Subnet IDs you've allocated for ClickHouse * The availability zones these subnets are in - ### Optional: Setup VPC Peering {#optional-setup-vpc-peering} To create or delete VPC peering for ClickHouse BYOC, follow the steps: @@ -132,11 +123,7 @@ Contact ClickHouse Support to enable Private Load Balancer. 5. Set the VPC Acceptor to the target VPC ID. (Select another account if applicable) 6. Click Create Peering Connection. -
+ BYOC Create Peering Connection

#### Step 3 Accept the peering connection request {#step-3-accept-the-peering-connection-request}
Go to the peering account. On the (VPC -> Peering connections -> Actions -> Accept request) page, you can approve this VPC peering request.

#### Step 4 Add destination to ClickHouse VPC route tables {#step-4-add-destination-to-clickhouse-vpc-route-tables}
In ClickHouse BYOC account,

@@ -156,11 +143,7 @@ In ClickHouse BYOC account,
5. Enter the CIDR range of the target VPC for the Destination.
6. Select “Peering Connection” and the ID of the peering connection for the Target.
+ BYOC Add route table #### Step 5 Add destination to the target VPC route tables {#step-5-add-destination-to-the-target-vpc-route-tables} In the peering AWS account, @@ -171,11 +154,7 @@ In the peering AWS account, 5. Enter the CIDR range of the ClickHouse VPC for the Destination. 6. Select “Peering Connection” and the ID of the peering connection for the Target. -
+ BYOC Add route table #### Step 6: Edit security group to allow peered VPC access {#step-6-edit-security-group-to-allow-peered-vpc-access} In the ClickHouse BYOC account, you need to update the Security Group settings to allow traffic from your peered VPC. Please contact ClickHouse Support to request the addition of inbound rules that include the CIDR ranges of your peered VPC. @@ -187,7 +166,7 @@ To access ClickHouse privately, a private load balancer and endpoint are provisi - **Public endpoint**: `h5ju65kv87.mhp0y4dmph.us-west-2.aws.byoc.clickhouse.cloud` - **Private endpoint**: `h5ju65kv87-private.mhp0y4dmph.us-west-2.aws.byoc.clickhouse.cloud` -Optional, after verifying that peering is working, you can request the removal of the public load balancer for ClickHouse BYOC. + Optional, after verifying that peering is working, you can request the removal of the public load balancer for ClickHouse BYOC. ## Upgrade process {#upgrade-process} @@ -217,21 +196,21 @@ In addition to the `ClickHouseManagementRole` created via CloudFormation, the co These roles are assumed by applications running within the customer's EKS cluster: - **State Exporter Role** - - ClickHouse component that reports service health information to ClickHouse Cloud. - - Requires permission to write to an SQS queue owned by ClickHouse Cloud. + - ClickHouse component that reports service health information to ClickHouse Cloud. + - Requires permission to write to an SQS queue owned by ClickHouse Cloud. - **Load-Balancer Controller** - - Standard AWS load balancer controller. - - EBS CSI Controller to manage volumes for ClickHouse services. + - Standard AWS load balancer controller. + - EBS CSI Controller to manage volumes for ClickHouse services. - **External-DNS** - - Propagates DNS configurations to Route 53. + - Propagates DNS configurations to Route 53. - **Cert-Manager** - - Provisions TLS certificates for BYOC service domains. + - Provisions TLS certificates for BYOC service domains. - **Cluster Autoscaler** - - Adjusts the node group size as needed. + - Adjusts the node group size as needed. -**K8s-control-plane** and **k8s-worker** roles are meant to be assumed by AWS EKS services. + **K8s-control-plane** and **k8s-worker** roles are meant to be assumed by AWS EKS services. -Lastly, **`data-plane-mgmt`** allows a ClickHouse Cloud Control Plane component to reconcile necessary custom resources, such as `ClickHouseCluster` and the Istio Virtual Service/Gateway. + Lastly, **`data-plane-mgmt`** allows a ClickHouse Cloud Control Plane component to reconcile necessary custom resources, such as `ClickHouseCluster` and the Istio Virtual Service/Gateway. ## Network boundaries {#network-boundaries} @@ -242,13 +221,13 @@ This section covers different network traffic to and from the customer BYOC VPC: - **Public**: A network endpoint accessible from the public internet. - **Private**: A network endpoint accessible only through private connections, such as VPC peering, VPC Private Link, or Tailscale. -**Istio ingress is deployed behind an AWS NLB to accept ClickHouse client traffic.** + **Istio ingress is deployed behind an AWS NLB to accept ClickHouse client traffic.** -*Inbound, Public (can be Private)* + *Inbound, Public (can be Private)* -The Istio ingress gateway terminates TLS. The certificate, provisioned by CertManager with Let's Encrypt, is stored as a secret within the EKS cluster. 
Traffic between Istio and ClickHouse is [encrypted by AWS](https://docs.aws.amazon.com/whitepapers/latest/logical-separation/encrypting-data-at-rest-and--in-transit.html#:~:text=All%20network%20traffic%20between%20AWS,supported%20Amazon%20EC2%20instance%20types) since they reside in the same VPC. + The Istio ingress gateway terminates TLS. The certificate, provisioned by CertManager with Let's Encrypt, is stored as a secret within the EKS cluster. Traffic between Istio and ClickHouse is [encrypted by AWS](https://docs.aws.amazon.com/whitepapers/latest/logical-separation/encrypting-data-at-rest-and--in-transit.html#:~:text=All%20network%20traffic%20between%20AWS,supported%20Amazon%20EC2%20instance%20types) since they reside in the same VPC. -By default, ingress is publicly accessible with IP allow list filtering. Customers can configure VPC peering to make it private and disable public connections. We highly recommend setting up an [IP filter](/cloud/security/setting-ip-filters) to restrict access. + By default, ingress is publicly accessible with IP allow list filtering. Customers can configure VPC peering to make it private and disable public connections. We highly recommend setting up an [IP filter](/cloud/security/setting-ip-filters) to restrict access. ### Troubleshooting access {#troubleshooting-access} @@ -284,16 +263,16 @@ State Exporter sends ClickHouse service state information to an SQS owned by Cli - **SharedMergeTree**: ClickHouse Cloud and BYOC use the same binary and configuration. Therefore all features from ClickHouse core are supported in BYOC such as SharedMergeTree. - **Console access for managing service state**: - - Supports operations such as start, stop, and terminate. - - View services and status. + - Supports operations such as start, stop, and terminate. + - View services and status. - **Backup and restore.** - **Manual vertical and horizontal scaling.** - **Idling.** - **Warehouses**: Compute-Compute Separation - **Zero Trust Network via Tailscale.** - **Monitoring**: - - The Cloud console includes built-in health dashboards for monitoring service health. - - Prometheus scraping for centralized monitoring with Prometheus, Grafana, and Datadog. See the [Prometheus documentation](/integrations/prometheus) for setup instructions. + - The Cloud console includes built-in health dashboards for monitoring service health. + - Prometheus scraping for centralized monitoring with Prometheus, Grafana, and Datadog. See the [Prometheus documentation](/integrations/prometheus) for setup instructions. - **VPC Peering.** - **Integrations**: See the full list on [this page](/integrations). - **Secure S3.** @@ -423,18 +402,18 @@ global: scrape_configs: - job_name: "prometheus" - static_configs: - - targets: ["localhost:9090"] + static_configs: + - targets: ["localhost:9090"] - job_name: "clickhouse" - static_configs: + static_configs: - targets: ["..aws.byoc.clickhouse.cloud:8443"] - scheme: https - metrics_path: "/metrics_all" - basic_auth: + scheme: https + metrics_path: "/metrics_all" + basic_auth: username: password: - honor_labels: true -``` + honor_labels: true + ``` Please also see [this blog post](https://clickhouse.com/blog/clickhouse-cloud-now-supports-prometheus-monitoring) and the [Prometheus setup docs for ClickHouse](/integrations/prometheus). 
diff --git a/docs/cloud/reference/changelog.md b/docs/cloud/reference/changelog.md index 7a2512e712f..a28919ed12e 100644 --- a/docs/cloud/reference/changelog.md +++ b/docs/cloud/reference/changelog.md @@ -34,124 +34,123 @@ In addition to this ClickHouse Cloud changelog, please see the [Cloud Compatibil ## July 11, 2025 {#june-11-2025} - New services now store database and table metadata in a central **SharedCatalog**, - a new model for coordination and object lifecycles which enables: + a new model for coordination and object lifecycles which enables: - **Cloud-scale DDL**, even under high concurrency - **Resilient deletion and new DDL operations** - **Fast spin-up and wake-ups** as stateless nodes now launch with no disk dependencies - **Stateless compute across both native and open formats**, including Iceberg and Delta Lake - - Read more about SharedCatalog in our [blog](https://clickhouse.com/blog/clickhouse-cloud-stateless-compute) + + Read more about SharedCatalog in our [blog](https://clickhouse.com/blog/clickhouse-cloud-stateless-compute) - We now support the ability to launch HIPAA compliant services in GCP `europe-west4` ## June 27, 2025 {#june-27-2025} - We now officially support a Terraform provider for managing database privileges - which is also compatible with self-managed deployments. Please refer to the - [blog](https://clickhouse.com/blog/new-terraform-provider-manage-clickhouse-database-users-roles-and-privileges-with-code) - and our [docs](https://registry.terraform.io/providers/ClickHouse/clickhousedbops/latest/docs) - for more information. -- Enterprise tier services can now enlist in the [slow release channel](/manage/updates/#slow-release-channel-deferred-upgrades) to defer - upgrades by two weeks after the regular release to permit additional time for - testing. + which is also compatible with self-managed deployments. Please refer to the + [blog](https://clickhouse.com/blog/new-terraform-provider-manage-clickhouse-database-users-roles-and-privileges-with-code) + and our [docs](https://registry.terraform.io/providers/ClickHouse/clickhousedbops/latest/docs) + for more information. +- Enterprise tier services can now enlist in the [slow release channel](/manage/updates/#slow-release-channel-deferred-upgrades) to defer + upgrades by two weeks after the regular release to permit additional time for + testing. ## June 13, 2025 {#june-13-2025} - We're excited to announce that ClickHouse Cloud Dashboards are now generally available. Dashboards allow users to visualize queries on dashboards, interact with data via filters and query parameters, and manage sharing. -- API key IP filters: we are introducing an additional layer of protection for your interactions with ClickHouse Cloud. When generating an API key, you may setup an IP allow list to limit where the API key may be used. Please refer to the [documentation](https://clickhouse.com/docs/cloud/security/setting-ip-filters) for details. +- API key IP filters: we are introducing an additional layer of protection for your interactions with ClickHouse Cloud. When generating an API key, you may setup an IP allow list to limit where the API key may be used. Please refer to the [documentation](https://clickhouse.com/docs/cloud/security/setting-ip-filters) for details. ## May 30, 2025 {#may-30-2025} - We're excited to announce general availability of **ClickPipes for Postgres CDC** - in ClickHouse Cloud. With just a few clicks, you can now replicate your Postgres - databases and unlock blazing-fast, real-time analytics. 
The connector delivers - faster data synchronization, latency as low as a few seconds, automatic schema changes, - fully secure connectivity, and more. Refer to the - [blog](https://clickhouse.com/blog/postgres-cdc-connector-clickpipes-ga) for - more information. To get started, refer to the instructions [here](https://clickhouse.com/docs/integrations/clickpipes/postgres). + in ClickHouse Cloud. With just a few clicks, you can now replicate your Postgres + databases and unlock blazing-fast, real-time analytics. The connector delivers + faster data synchronization, latency as low as a few seconds, automatic schema changes, + fully secure connectivity, and more. Refer to the + [blog](https://clickhouse.com/blog/postgres-cdc-connector-clickpipes-ga) for + more information. To get started, refer to the instructions [here](https://clickhouse.com/docs/integrations/clickpipes/postgres). - Introduced new improvements to the SQL console dashboards: - - Sharing: You can share your dashboard with your team members. Four levels of access are supported, that can be adjusted both globally and on a per-user basis: + - Sharing: You can share your dashboard with your team members. Four levels of access are supported, that can be adjusted both globally and on a per-user basis: - _Write access_: Add/edit visualizations, refresh settings, interact with dashboards via filters. - _Owner_: Share a dashboard, delete a dashboard, and all other permissions of a user with "write access". - _Read-only access_: View and interact with dashboard via filters - _No access_: Cannot view a dashboard - - For existing dashboards that have already been created, Organization Administrators can assign existing dashboards to themselves as owners. - - You can now add a table or chart from the SQL console to a dashboard from the query view. - -Dashboards improvements + - For existing dashboards that have already been created, Organization Administrators can assign existing dashboards to themselves as owners. + - You can now add a table or chart from the SQL console to a dashboard from the query view. + Dashboards improvements -- We are enlisting preview participants for [Distributed cache](https://clickhouse.com/cloud/distributed-cache-waitlist) - for AWS and GCP. Read more in the [blog](https://clickhouse.com/blog/building-a-distributed-cache-for-s3). +- We are enlisting preview participants for [Distributed cache](https://clickhouse.com/cloud/distributed-cache-waitlist) + for AWS and GCP. Read more in the [blog](https://clickhouse.com/blog/building-a-distributed-cache-for-s3). ## May 16, 2025 {#may-16-2025} -- Introduced the Resource Utilization Dashboard which provides a view of - resources being used by a service in ClickHouse Cloud. The following metrics - are scraped from system tables, and displayed on this dashboard: - * Memory & CPU: Graphs for `CGroupMemoryTotal` (Allocated Memory), `CGroupMaxCPU` (allocated CPU), +- Introduced the Resource Utilization Dashboard which provides a view of + resources being used by a service in ClickHouse Cloud. The following metrics + are scraped from system tables, and displayed on this dashboard: + * Memory & CPU: Graphs for `CGroupMemoryTotal` (Allocated Memory), `CGroupMaxCPU` (allocated CPU), `MemoryResident` (memory used), and `ProfileEvent_OSCPUVirtualTimeMicroseconds` (CPU used) - * Data Transfer: Graphs showing data ingress and egress from ClickHouse Cloud. Learn more [here](/cloud/manage/network-data-transfer). 
-- We're excited to announce the launch of our new ClickHouse Cloud Prometheus/Grafana mix-in, - built to simplify monitoring for your ClickHouse Cloud services. - This mix-in uses our Prometheus-compatible API endpoint to seamlessly integrate - ClickHouse metrics into your existing Prometheus and Grafana setup. It includes - a pre-configured dashboard that gives you real-time visibility into the health - and performance of your services. Refer to the launch [blog](https://clickhouse.com/blog/monitor-with-new-prometheus-grafana-mix-in) to read more. + * Data Transfer: Graphs showing data ingress and egress from ClickHouse Cloud. Learn more [here](/cloud/manage/network-data-transfer). +- We're excited to announce the launch of our new ClickHouse Cloud Prometheus/Grafana mix-in, + built to simplify monitoring for your ClickHouse Cloud services. + This mix-in uses our Prometheus-compatible API endpoint to seamlessly integrate + ClickHouse metrics into your existing Prometheus and Grafana setup. It includes + a pre-configured dashboard that gives you real-time visibility into the health + and performance of your services. Refer to the launch [blog](https://clickhouse.com/blog/monitor-with-new-prometheus-grafana-mix-in) to read more. ## April 18, 2025 {#april-18-2025} -- Introduced a new **Member** organization level role and two new service level - roles: **Service Admin** and **Service Read Only**. - **Member** is an organization level role that is assigned to SAML SSO users by - default and provides only sign-in and profile update capabilities. **Service Admin** - and **Service Read Only** roles for one or more services can be assigned to users - with **Member**, **Developer**, or **Billing Admin** roles. For more information - see ["Access control in ClickHouse Cloud"](https://clickhouse.com/docs/cloud/security/cloud-access-management/overview) +- Introduced a new **Member** organization level role and two new service level + roles: **Service Admin** and **Service Read Only**. + **Member** is an organization level role that is assigned to SAML SSO users by + default and provides only sign-in and profile update capabilities. **Service Admin** + and **Service Read Only** roles for one or more services can be assigned to users + with **Member**, **Developer**, or **Billing Admin** roles. For more information + see ["Access control in ClickHouse Cloud"](https://clickhouse.com/docs/cloud/security/cloud-access-management/overview) - ClickHouse Cloud now offers **HIPAA** and **PCI** services in the following regions - for **Enterprise** customers: AWS eu-central-1, AWS eu-west-2, AWS us-east-2. -- Introduced **user facing notifications for ClickPipes**. This feature sends - automatic alerts for ClickPipes failures via email, ClickHouse Cloud UI, and - Slack. Notifications via email and UI are enabled by default and can be - configured per pipe. For **Postgres CDC ClickPipes**, alerts also cover - replication slot threshold (configurable in the **Settings** tab), specific error - types, and self-serve steps to resolve failures. -- **MySQL CDC private preview** is now open. This lets customers replicate MySQL - databases to ClickHouse Cloud in a few clicks, enabling fast analytics and - removing the need for external ETL tools. The connector supports both continuous - replication and one-time migrations, whether MySQL is on the cloud (RDS, - Aurora, Cloud SQL, Azure, etc.) or on-premises. 
You can sign up to the private - preview by [following this link](https://clickhouse.com/cloud/clickpipes/mysql-cdc-connector). -- Introduced **AWS PrivateLink for ClickPipes**. You can use AWS PrivateLink to - establish secure connectivity between VPCs, AWS services, your on-premises - systems, and ClickHouse Cloud. This can be done without exposing traffic to - the public internet while moving data from sources like Postgres, MySQL, and - MSK on AWS. It also supports cross-region access through VPC service endpoints. - PrivateLink connectivity set-up is now [fully self-serve](https://clickhouse.com/docs/integrations/clickpipes/aws-privatelink) - through ClickPipes. + for **Enterprise** customers: AWS eu-central-1, AWS eu-west-2, AWS us-east-2. +- Introduced **user facing notifications for ClickPipes**. This feature sends + automatic alerts for ClickPipes failures via email, ClickHouse Cloud UI, and + Slack. Notifications via email and UI are enabled by default and can be + configured per pipe. For **Postgres CDC ClickPipes**, alerts also cover + replication slot threshold (configurable in the **Settings** tab), specific error + types, and self-serve steps to resolve failures. +- **MySQL CDC private preview** is now open. This lets customers replicate MySQL + databases to ClickHouse Cloud in a few clicks, enabling fast analytics and + removing the need for external ETL tools. The connector supports both continuous + replication and one-time migrations, whether MySQL is on the cloud (RDS, + Aurora, Cloud SQL, Azure, etc.) or on-premises. You can sign up to the private + preview by [following this link](https://clickhouse.com/cloud/clickpipes/mysql-cdc-connector). +- Introduced **AWS PrivateLink for ClickPipes**. You can use AWS PrivateLink to + establish secure connectivity between VPCs, AWS services, your on-premises + systems, and ClickHouse Cloud. This can be done without exposing traffic to + the public internet while moving data from sources like Postgres, MySQL, and + MSK on AWS. It also supports cross-region access through VPC service endpoints. + PrivateLink connectivity set-up is now [fully self-serve](https://clickhouse.com/docs/integrations/clickpipes/aws-privatelink) + through ClickPipes. ## April 4, 2025 {#april-4-2025} - Slack notifications for ClickHouse Cloud: ClickHouse Cloud now supports Slack notifications for billing, scaling, and ClickPipes events, in addition to in-console and email notifications. These notifications are sent via the ClickHouse Cloud Slack application. Organization admins can configure these notifications via the notification center by specifying slack channels to which notifications should be sent. -- Users running Production and Development services will now see ClickPipes and data transfer usage price on their bills. Please refer to the [announcement](/cloud/manage/jan-2025-faq/pricing-dimensions) from January 2025 for more details. - +- Users running Production and Development services will now see ClickPipes and data transfer usage price on their bills. Please refer to the [announcement](/cloud/manage/jan-2025-faq/pricing-dimensions) from January 2025 for more details. + ## March 21, 2025 {#march-21-2025} - Cross-region Private Link connectivity on AWS is now in Beta. Please refer to - ClickHouse Cloud private link [docs](/manage/security/aws-privatelink) for - details of how to set up and list of supported regions. -- The maximum replica size available for services on AWS is now set to 236 GiB RAM. 
- This allows for efficient utilization, while ensuring we have resources - allocated to background processes. + ClickHouse Cloud private link [docs](/manage/security/aws-privatelink) for + details of how to set up and list of supported regions. +- The maximum replica size available for services on AWS is now set to 236 GiB RAM. + This allows for efficient utilization, while ensuring we have resources + allocated to background processes. ## March 7, 2025 {#march-7-2025} - New `UsageCost` API endpoint: The API specification now supports a new endpoint - for retrieving usage information. This is an organization endpoint and usage - costs can be queried for a maximum of 31 days. The metrics that can be - retrieved include Storage, Compute, Data Transfer and ClickPipes. Please refer - to the [documentation](https://clickhouse.com/docs/cloud/manage/api/usageCost-api-reference) for details. + for retrieving usage information. This is an organization endpoint and usage + costs can be queried for a maximum of 31 days. The metrics that can be + retrieved include Storage, Compute, Data Transfer and ClickPipes. Please refer + to the [documentation](https://clickhouse.com/docs/cloud/manage/api/usageCost-api-reference) for details. - Terraform provider [v2.1.0](https://registry.terraform.io/providers/ClickHouse/clickhouse/2.1.0/docs/resources/service#nestedatt--endpoints_configuration) release supports enabling the MySQL endpoint. ## February 21, 2025 {#february-21-2025} @@ -165,7 +164,7 @@ need to comply with strict data residency requirements by ensuring all data stay within a secure customer environment. - For more details, you can refer to the [documentation](/cloud/reference/byoc) for BYOC - or read our [announcement blog post](https://clickhouse.com/blog/announcing-general-availability-of-clickhouse-bring-your-own-cloud-on-aws). + or read our [announcement blog post](https://clickhouse.com/blog/announcing-general-availability-of-clickhouse-bring-your-own-cloud-on-aws). - [Contact us](https://clickhouse.com/cloud/bring-your-own-cloud) to request access. ### Postgres CDC connector for ClickPipes {#postgres-cdc-connector-for-clickpipes} @@ -201,10 +200,9 @@ introducing guardrails for the number of tables, databases, partitions and parts in use. - Refer to the [usage limits](https://clickhouse.com/docs/cloud/bestpractices/usage-limits) - section of the documentation for details. + section of the documentation for details. - If your service is already above these limits, we will permit a 10% increase. - Please contact [support](https://clickhouse.com/support/program) if you have any questions. - + Please contact [support](https://clickhouse.com/support/program) if you have any questions. ## January 27, 2025 {#january-27-2025} @@ -272,20 +270,19 @@ We've added several enhancements for the Prometheus integration: - **Organization-level endpoint**. We've introduced an enhancement to our Prometheus integration for ClickHouse Cloud. In addition to service-level metrics, the API now includes an endpoint for **organization-level metrics**. This new endpoint automatically collects metrics for all services within your organization, streamlining the process of exporting metrics into your Prometheus collector. These metrics can be integrated with visualization tools like Grafana and Datadog for a more comprehensive view of your organization's performance. - This feature is available now for all users. You can find more details [here](/integrations/prometheus). + This feature is available now for all users. 
You can find more details [here](/integrations/prometheus). - **Filtered metrics**. We've added support for returning a filtered list of metrics in our Prometheus integration for ClickHouse Cloud. This feature helps reduce response payload size by enabling you to focus on metrics that are critical for monitoring the health of your service. - This functionality is available via an optional query parameter in the API, making it easier to optimize your data collection and streamline integrations with tools like Grafana and Datadog. - - The filtered metrics feature is now available for all users. You can find more details [here](/integrations/prometheus). + This functionality is available via an optional query parameter in the API, making it easier to optimize your data collection and streamline integrations with tools like Grafana and Datadog. + The filtered metrics feature is now available for all users. You can find more details [here](/integrations/prometheus). ## December 20, 2024 {#december-20-2024} ### Marketplace subscription organization attachment {#marketplace-subscription-organization-attachment} -You can now attach your new marketplace subscription to an existing ClickHouse Cloud organization. Once you finish subscribing to the marketplace and redirect to ClickHouse Cloud, you can connect an existing organization created in the past to the new marketplace subscription. From this point, your resources in the organization will be billed via the marketplace. +You can now attach your new marketplace subscription to an existing ClickHouse Cloud organization. Once you finish subscribing to the marketplace and redirect to ClickHouse Cloud, you can connect an existing organization created in the past to the new marketplace subscription. From this point, your resources in the organization will be billed via the marketplace. ClickHouse Cloud interface showing how to add a marketplace subscription to an existing organization @@ -295,7 +292,7 @@ It is now possible to restrict the expiry options of API keys so you don't creat ### Custom emails for notifications {#custom-emails-for-notifications} -Org Admins can now add more email addresses to a specific notification as additional recipients. This is useful in case you want to send notifications to an alias or to other users within your organization who might not be users of ClickHouse Cloud. To configure this, go to the Notification Settings from the cloud console and edit the email addresses that you want to receive the email notifications. +Org Admins can now add more email addresses to a specific notification as additional recipients. This is useful in case you want to send notifications to an alias or to other users within your organization who might not be users of ClickHouse Cloud. To configure this, go to the Notification Settings from the cloud console and edit the email addresses that you want to receive the email notifications. ## December 6, 2024 {#december-6-2024} @@ -323,11 +320,11 @@ We are excited to announce the GA release of Query API Endpoints in ClickHouse C * Result streaming * Support for all ClickHouse-compatible output formats -In addition to these improvements, we are excited to announce generic query API endpoints that, leveraging our existing framework, allow you to execute arbitrary SQL queries against your ClickHouse Cloud service(s). Generic endpoints can be enabled and configured from the service settings page. 
+ In addition to these improvements, we are excited to announce generic query API endpoints that, leveraging our existing framework, allow you to execute arbitrary SQL queries against your ClickHouse Cloud service(s). Generic endpoints can be enabled and configured from the service settings page. -To get started, follow the [Query API Endpoints documentation](/cloud/get-started/query-endpoints). + To get started, follow the [Query API Endpoints documentation](/cloud/get-started/query-endpoints). -ClickHouse Cloud interface showing the API Endpoints configuration with various settings + ClickHouse Cloud interface showing the API Endpoints configuration with various settings ### Native JSON support (Beta) {#native-json-support-beta} @@ -502,7 +499,7 @@ This release also includes support for subscriptions via the [Microsoft Azure Ma - United States: East US 2 (Virginia) - Europe: Germany West Central (Frankfurt) -If you'd like any specific region to be supported, please [contact us](https://clickhouse.com/support/program). + If you'd like any specific region to be supported, please [contact us](https://clickhouse.com/support/program). ### Query log insights {#query-log-insights} @@ -532,7 +529,7 @@ The following options are available: - From latest: Begin consuming data from the most recent offset. This is useful for users who are only interested in new messages. - From a timestamp: Start consuming data from messages that were produced at or after a specific timestamp. This feature allows for more precise control, enabling users to resume processing from an exact point in time. -ClickPipes Kafka connector configuration interface showing offset selection options + ClickPipes Kafka connector configuration interface showing offset selection options ### Enroll services to the Fast release channel {#enroll-services-to-the-fast-release-channel} @@ -617,7 +614,7 @@ This is an open-source integration built over many months of hard work by our In **Console changes** - Output formats support in the SQL console -**Integrations changes** + **Integrations changes** - ClickPipes Kafka connector supports multi-broker setup - PowerBI connector supports providing ODBC driver configuration options. @@ -643,7 +640,6 @@ Other changes: - Java client: Fixed bug with incorrect error code parsing - Python client: Fixed parameters binding for numeric types, fixed bugs with number list in query binding, added SQLAlchemy Point support. 
- ## April 4, 2024 {#april-4-2024} ### Introducing the new ClickHouse Cloud console {#introducing-the-new-clickhouse-cloud-console} @@ -764,21 +760,20 @@ This release brings availability of ClickPipes for Azure Event Hub, dramatically ### Integrations changes {#integrations-changes-5} * [ClickHouse data source for Grafana](https://grafana.com/grafana/plugins/grafana-clickhouse-datasource/) v4 release - * Completely rebuilt query builder to have specialized editors for Table, Logs, Time Series, and Traces - * Completely rebuilt SQL generator to support more complicated and dynamic queries - * Added first-class support for OpenTelemetry in Log and Trace views - * Extended Configuration to allow to specify default tables and columns for Logs and Traces - * Added ability to specify custom HTTP headers - * And many more improvements - check the full [changelog](https://github.com/grafana/clickhouse-datasource/blob/main/CHANGELOG.md#400) + * Completely rebuilt query builder to have specialized editors for Table, Logs, Time Series, and Traces + * Completely rebuilt SQL generator to support more complicated and dynamic queries + * Added first-class support for OpenTelemetry in Log and Trace views + * Extended Configuration to allow to specify default tables and columns for Logs and Traces + * Added ability to specify custom HTTP headers + * And many more improvements - check the full [changelog](https://github.com/grafana/clickhouse-datasource/blob/main/CHANGELOG.md#400) * Database schema management tools - * [Flyway added ClickHouse support](https://github.com/flyway/flyway-community-db-support/packages/2037428) - * [Ariga Atlas added ClickHouse support](https://atlasgo.io/blog/2023/12/19/atlas-v-0-16#clickhouse-beta-program) + * [Flyway added ClickHouse support](https://github.com/flyway/flyway-community-db-support/packages/2037428) + * [Ariga Atlas added ClickHouse support](https://atlasgo.io/blog/2023/12/19/atlas-v-0-16#clickhouse-beta-program) * Kafka Connector Sink - * Optimized ingestion into a table with default values - * Added support for string-based dates in DateTime64 + * Optimized ingestion into a table with default values + * Added support for string-based dates in DateTime64 * Metabase - * Added support for a connection to multiple databases - + * Added support for a connection to multiple databases ## January 18, 2024 {#january-18-2024} @@ -793,10 +788,10 @@ This release brings a new region in AWS (London / eu-west-2), adds ClickPipes su ### Integrations changes {#integrations-changes-6} - Java client: - - Breaking changes: Removed the ability to specify random URL handles in the call. This functionality has been removed from ClickHouse - - Deprecations: Java CLI client and GRPC packages - - Added support for RowBinaryWithDefaults format to reduce the batch size and workload on ClickHouse instance (request by Exabeam) - - Made Date32 and DateTime64 range boundaries compatible with ClickHouse, compatibility with Spark Array string type, node selection mechanism + - Breaking changes: Removed the ability to specify random URL handles in the call. 
This functionality has been removed from ClickHouse + - Deprecations: Java CLI client and GRPC packages + - Added support for RowBinaryWithDefaults format to reduce the batch size and workload on ClickHouse instance (request by Exabeam) + - Made Date32 and DateTime64 range boundaries compatible with ClickHouse, compatibility with Spark Array string type, node selection mechanism - Kafka Connector: Added a JMX monitoring dashboard for Grafana - PowerBI: Made ODBC driver settings configurable in the UI - JavaScript client: Exposed query summary information, allow to provide a subset of specific columns for insertion, make keep_alive configurable for web client @@ -805,7 +800,6 @@ This release brings a new region in AWS (London / eu-west-2), adds ClickPipes su ### Reliability changes {#reliability-changes} - User-facing backward incompatible change: Previously, two features ([is_deleted](/engines/table-engines/mergetree-family/replacingmergetree#is_deleted) and ``OPTIMIZE CLEANUP``) under certain conditions could lead to corruption of the data in ClickHouse. To protect the integrity of the data of our users, while keeping the core of the functionality, we adjusted how this feature works. Specifically, the MergeTree setting ``clean_deleted_rows`` is now deprecated and has no effect anymore. The ``CLEANUP`` keyword is not allowed by default (to use it you will need to enable ``allow_experimental_replacing_merge_with_cleanup``). If you decide to use ``CLEANUP``, you need to make sure that it is always used together with ``FINAL``, and you must guarantee that no rows with older versions will be inserted after you run ``OPTIMIZE FINAL CLEANUP``. - ## December 18, 2023 {#december-18-2023} This release brings a new region in GCP (us-east1), ability to self-service secure endpoint connections, support for additional integrations including DBT 1.7, and numerous bug fixes and security enhancements. @@ -824,17 +818,16 @@ This release brings a new region in GCP (us-east1), ability to self-service secu - PowerBI Connector: Added ability to run on PowerBI Cloud - Make permissions for ClickPipes internal user configurable - Kafka Connect - - Improved deduplication logic and ingestion of Nullable types. - - Add support text-based formats (CSV, TSV) + - Improved deduplication logic and ingestion of Nullable types. 
+ - Add support text-based formats (CSV, TSV) - Apache Beam: add support for Boolean and LowCardinality types - Nodejs client: add support for Parquet format ### Security announcements {#security-announcements} - Patched 3 security vulnerabilities - see [security changelog](/whats-new/security-changelog) for details: - - CVE 2023-47118 (CVSS 7.0) - a heap buffer overflow vulnerability affecting the native interface running by default on port 9000/tcp - - CVE-2023-48704 (CVSS 7.0) - a heap buffer overflow vulnerability affecting the native interface running by default on port 9000/tcp - - CVE 2023-48298 (CVSS 5.9) - an integer underflow vulnerability in the FPC compressions codec - + - CVE 2023-47118 (CVSS 7.0) - a heap buffer overflow vulnerability affecting the native interface running by default on port 9000/tcp + - CVE-2023-48704 (CVSS 7.0) - a heap buffer overflow vulnerability affecting the native interface running by default on port 9000/tcp + - CVE 2023-48298 (CVSS 5.9) - an integer underflow vulnerability in the FPC compressions codec ## November 22, 2023 {#november-22-2023} @@ -873,21 +866,21 @@ This release adds more regional support for development services in Asia, introd ### Integrations changes {#integrations-changes-9} - MySQL - - Improved Tableau Online and QuickSight support via MySQL + - Improved Tableau Online and QuickSight support via MySQL - Kafka Connector - - Introduced a new StringConverter to support text-based formats (CSV, TSV) - - Added support for Bytes and Decimal data types - - Adjusted Retryable Exceptions to now always be retried (even when errors.tolerance=all) + - Introduced a new StringConverter to support text-based formats (CSV, TSV) + - Added support for Bytes and Decimal data types + - Adjusted Retryable Exceptions to now always be retried (even when errors.tolerance=all) - Node.js client - - Fixed an issue with streamed large datasets providing corrupted results + - Fixed an issue with streamed large datasets providing corrupted results - Python client - - Fixed timeouts on large inserts - - Fixed NumPy/Pandas Date32 issue -​​- Golang client - - Fixed insertion of an empty map into JSON column, compression buffer cleanup, query escaping, panic on zero/nil for IPv4 and IPv6 - - Added watchdog on canceled inserts + - Fixed timeouts on large inserts + - Fixed NumPy/Pandas Date32 issue + ​​- Golang client + - Fixed insertion of an empty map into JSON column, compression buffer cleanup, query escaping, panic on zero/nil for IPv4 and IPv6 + - Added watchdog on canceled inserts - DBT - - Improved distributed table support with tests + - Improved distributed table support with tests ## October 19, 2023 {#october-19-2023} @@ -899,9 +892,9 @@ This release brings usability and performance improvements in the SQL console, b ### Integrations changes {#integrations-changes-10} - Java client: - - Switched the default network library to improve performance and reuse open connections - - Added proxy support - - Added support for secure connections with using Trust Store + - Switched the default network library to improve performance and reuse open connections + - Added proxy support + - Added support for secure connections with using Trust Store - Node.js client: Fixed keep-alive behavior for insert queries - Metabase: Fixed IPv4/IPv6 column serialization @@ -916,9 +909,9 @@ This release brings general availability of ClickPipes for Kafka, Confluent Clou ### Integrations changes {#integrations-changes-11} - Announced general availability of ClickPipes - a turnkey 
data ingestion service - for Kafka, Confluent Cloud, and Amazon MSK (see the [release blog](https://clickhouse.com/blog/clickpipes-is-generally-available)) - Reached general availability of Kafka Connect ClickHouse Sink - - Extended support for customized ClickHouse settings using `clickhouse.settings` property - - Improved deduplication behavior to account for dynamic fields - - Added support for `tableRefreshInterval` to re-fetch table changes from ClickHouse + - Extended support for customized ClickHouse settings using `clickhouse.settings` property + - Improved deduplication behavior to account for dynamic fields + - Added support for `tableRefreshInterval` to re-fetch table changes from ClickHouse - Fixed an SSL connection issue and type mappings between [PowerBI](/integrations/powerbi) and ClickHouse data types ## September 7, 2023 {#september-7-2023} @@ -1050,12 +1043,12 @@ This release brings the public release of the ClickHouse Cloud Programmatic API ## May 11, 2023 {#may-11-2023} This release brings the public beta of ClickHouse Cloud on GCP -(see [blog](https://clickhouse.com/blog/clickhouse-cloud-on-gcp-available-in-public-beta) +(see [blog](https://clickhouse.com/blog/clickhouse-cloud-on-gcp-available-in-public-beta) for details), extends administrators' rights to grant terminate query permissions, and adds more visibility into the status of MFA users in the Cloud console. :::note Update -ClickHouse Cloud on GCP is now GA, see the entry for June twenty above. +ClickHouse Cloud on GCP is now GA, see the entry for June twenty above. ::: ### ClickHouse Cloud on GCP is now available in public beta {#clickhouse-cloud-on-gcp-is-now-available-in-public-beta-now-ga-see-june-20th-entry-above} @@ -1138,15 +1131,14 @@ This release brings an API for retrieving cloud endpoints, an advanced scaling c - [Metabase](/integrations/data-visualization/metabase-and-clickhouse.md): Added support for multiple schemas - [Go client](/integrations/language-clients/go/index.md): Fixed idle connection liveness check for TLS connections - [Python client](/integrations/language-clients/python/index.md) - - Added support for external data in query methods - - Added timezone support for query results - - Added support for `no_proxy`/`NO_PROXY` environment variable - - Fixed server-side parameter binding of the NULL value for Nullable types + - Added support for external data in query methods + - Added timezone support for query results + - Added support for `no_proxy`/`NO_PROXY` environment variable + - Fixed server-side parameter binding of the NULL value for Nullable types ### Bug fixes {#bug-fixes-1} * Fixed behavior where running `INSERT INTO ... SELECT ...` from the SQL console incorrectly applied the same row limit as select queries - ## March 23, 2023 {#march-23-2023} This release brings database password complexity rules, significant speedup in restoring large backups, and support for displaying traces in Grafana Trace View. 
@@ -1161,26 +1153,25 @@ This release brings database password complexity rules, significant speedup in r ### Integrations changes {#integrations-changes-22} - Grafana: - - Added support for displaying trace data stored in ClickHouse in Trace View - - Improved time range filters and added support for special characters in table names + - Added support for displaying trace data stored in ClickHouse in Trace View + - Improved time range filters and added support for special characters in table names - Superset: Added native ClickHouse support - Kafka Connect Sink: Added automatic date conversion and Null column handling - Metabase: Implemented compatibility with v0.46 - Python client: Fixed inserts in temporary tables and added support for Pandas Null - Golang client: Normalized Date types with timezone - Java client - - Added to SQL parser support for compression, infile, and outfile keywords - - Added credentials overload - - Fixed batch support with `ON CLUSTER` + - Added to SQL parser support for compression, infile, and outfile keywords + - Added credentials overload + - Fixed batch support with `ON CLUSTER` - Node.js client - - Added support for JSONStrings, JSONCompact, JSONCompactStrings, JSONColumnsWithMetadata formats - - `query_id` can now be provided for all main client methods + - Added support for JSONStrings, JSONCompact, JSONCompactStrings, JSONColumnsWithMetadata formats + - `query_id` can now be provided for all main client methods ### Bug fixes {#bug-fixes-2} - Fixed a bug resulting in slow initial provisioning and startup times for new services - Fixed a bug that resulted in slower query performance due to cache misconfiguration - ## March 9, 2023 {#march-9-2023} This release improves observability dashboards, optimizes time to create large backups, and adds the configuration necessary to drop large tables and partitions. 
@@ -1221,9 +1212,9 @@ Adds support for a subset of features in ClickHouse 23.1, for example: ### Integrations changes {#integrations-changes-23} - [Kafka-Connect](/integrations/data-ingestion/kafka/index.md): Added support for Amazon MSK - [Metabase](/integrations/data-visualization/metabase-and-clickhouse.md): First stable release 1.0.0 - - Made the connector is available on [Metabase Cloud](https://www.metabase.com/start/) - - Added a feature to explore all available databases - - Fixed synchronization of database with AggregationFunction type + - Made the connector is available on [Metabase Cloud](https://www.metabase.com/start/) + - Added a feature to explore all available databases + - Fixed synchronization of database with AggregationFunction type - [DBT-clickhouse](/integrations/data-ingestion/etl-tools/dbt/index.md): Added support for the latest DBT version v1.4.1 - [Python client](/integrations/language-clients/python/index.md): Improved proxy and ssh tunneling support; added a number of fixes and performance optimizations for Pandas DataFrames - [Nodejs client](/integrations/language-clients/js.md): Released ability to attach `query_id` to query result, which can be used to retrieve query metrics from the `system.query_log` @@ -1254,8 +1245,8 @@ This release brings an officially supported Metabase integration, a major Java c - [Go](/integrations/language-clients/go/index.md) client: [Bug fixes](https://github.com/ClickHouse/clickhouse-go/blob/main/CHANGELOG.md): close canceled connections, better handling of connection errors - [JS](/integrations/language-clients/js.md) client: [Breaking changes in exec/insert](https://github.com/ClickHouse/clickhouse-js/releases/tag/0.0.12); exposed query_id in the return types - [Java](https://github.com/ClickHouse/clickhouse-java#readme) client / JDBC driver major release - - [Breaking changes](https://github.com/ClickHouse/clickhouse-java/releases): deprecated methods, classes and packages were removed - - Added R2DBC driver and file insert support + - [Breaking changes](https://github.com/ClickHouse/clickhouse-java/releases): deprecated methods, classes and packages were removed + - Added R2DBC driver and file insert support ### Console changes {#console-changes-25} - Added support for views and materialized views in SQL console @@ -1266,7 +1257,6 @@ This release brings an officially supported Metabase integration, a major Java c - Fixed a bug where SQL console CSV export was truncated - Fixed a bug resulting in intermittent sample data upload failures - ## January 12, 2023 {#january-12-2023} This release updates the ClickHouse version to 22.12, enables dictionaries for many new sources, and improves query performance. 
@@ -1288,14 +1278,14 @@ This release updates the ClickHouse version to 22.12, enables dictionaries for m ### Integrations changes {#integrations-changes-25} - DBT release [v1.3.2](https://github.com/ClickHouse/dbt-clickhouse/blob/main/CHANGELOG.md#release-132-2022-12-23) - - Added experimental support for the delete+insert incremental strategy - - New s3source macro + - Added experimental support for the delete+insert incremental strategy + - New s3source macro - Python client [v0.4.8](https://github.com/ClickHouse/clickhouse-connect/blob/main/CHANGELOG.md#048-2023-01-02) - - File insert support - - Server-side query [parameters binding](/interfaces/cli.md/#cli-queries-with-parameters) + - File insert support + - Server-side query [parameters binding](/interfaces/cli.md/#cli-queries-with-parameters) - Go client [v2.5.0](https://github.com/ClickHouse/clickhouse-go/releases/tag/v2.5.0) - - Reduced memory usage for compression - - Server-side query [parameters binding](/interfaces/cli.md/#cli-queries-with-parameters) + - Reduced memory usage for compression + - Server-side query [parameters binding](/interfaces/cli.md/#cli-queries-with-parameters) ### Reliability and performance {#reliability-and-performance-2} - Improved read performance for queries that fetch a large number of small files on object store @@ -1446,7 +1436,6 @@ This release removes read & write units from pricing (see the [pricing page](htt - Added functions for Morton curve encoding, Java integer hashing, and random number generation. - See the [detailed 22.10 changelog](/whats-new/cloud#clickhouse-2210-version-upgrade) for the complete list of changes. - ## October 25, 2022 {#october-25-2022} This release significantly lowers compute consumption for small workloads, lowers compute pricing (see [pricing](https://clickhouse.com/pricing) page for details), improves stability through better defaults, and enhances the Billing and Usage views in the ClickHouse Cloud console. diff --git a/docs/cloud/reference/cloud-compatibility.md b/docs/cloud/reference/cloud-compatibility.md index 62cdfcb8710..3d22ac23558 100644 --- a/docs/cloud/reference/cloud-compatibility.md +++ b/docs/cloud/reference/cloud-compatibility.md @@ -25,10 +25,10 @@ ClickHouse Cloud provides access to a curated set of capabilities in the open so ### DDL syntax {#ddl-syntax} For the most part, the DDL syntax of ClickHouse Cloud should match what is available in self-managed installs. A few notable exceptions: - - Support for `CREATE AS SELECT`, which is currently not available. As a workaround, we suggest using `CREATE ... EMPTY ... AS SELECT` and then inserting into that table (see [this blog](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1) for an example). - - Some experimental syntax may be disabled, for instance, `ALTER TABLE ... MODIFY QUERY` statement. - - Some introspection functionality may be disabled for security purposes, for example, the `addressToLine` SQL function. - - Do not use `ON CLUSTER` parameters in ClickHouse Cloud - these are not needed. While these are mostly no-op functions, they can still cause an error if you are trying to use [macros](/operations/server-configuration-parameters/settings#macros). Macros often do not work and are not needed in ClickHouse Cloud. +- Support for `CREATE AS SELECT`, which is currently not available. As a workaround, we suggest using `CREATE ... EMPTY ... 
AS SELECT` and then inserting into that table (see [this blog](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1) for an example). +- Some experimental syntax may be disabled, for instance, `ALTER TABLE ... MODIFY QUERY` statement. +- Some introspection functionality may be disabled for security purposes, for example, the `addressToLine` SQL function. +- Do not use `ON CLUSTER` parameters in ClickHouse Cloud - these are not needed. While these are mostly no-op functions, they can still cause an error if you are trying to use [macros](/operations/server-configuration-parameters/settings#macros). Macros often do not work and are not needed in ClickHouse Cloud. ### Database and table engines {#database-and-table-engines} @@ -36,33 +36,33 @@ ClickHouse Cloud provides a highly-available, replicated service by default. As **Supported table engines** - - ReplicatedMergeTree (default, when none is specified) - - ReplicatedSummingMergeTree - - ReplicatedAggregatingMergeTree - - ReplicatedReplacingMergeTree - - ReplicatedCollapsingMergeTree - - ReplicatedVersionedCollapsingMergeTree - - MergeTree (converted to ReplicatedMergeTree) - - SummingMergeTree (converted to ReplicatedSummingMergeTree) - - AggregatingMergeTree (converted to ReplicatedAggregatingMergeTree) - - ReplacingMergeTree (converted to ReplicatedReplacingMergeTree) - - CollapsingMergeTree (converted to ReplicatedCollapsingMergeTree) - - VersionedCollapsingMergeTree (converted to ReplicatedVersionedCollapsingMergeTree) - - URL - - View - - MaterializedView - - GenerateRandom - - Null - - Buffer - - Memory - - Deltalake - - Hudi - - MySQL - - MongoDB - - NATS - - RabbitMQ - - PostgreSQL - - S3 +- ReplicatedMergeTree (default, when none is specified) +- ReplicatedSummingMergeTree +- ReplicatedAggregatingMergeTree +- ReplicatedReplacingMergeTree +- ReplicatedCollapsingMergeTree +- ReplicatedVersionedCollapsingMergeTree +- MergeTree (converted to ReplicatedMergeTree) +- SummingMergeTree (converted to ReplicatedSummingMergeTree) +- AggregatingMergeTree (converted to ReplicatedAggregatingMergeTree) +- ReplacingMergeTree (converted to ReplicatedReplacingMergeTree) +- CollapsingMergeTree (converted to ReplicatedCollapsingMergeTree) +- VersionedCollapsingMergeTree (converted to ReplicatedVersionedCollapsingMergeTree) +- URL +- View +- MaterializedView +- GenerateRandom +- Null +- Buffer +- Memory +- Deltalake +- Hudi +- MySQL +- MongoDB +- NATS +- RabbitMQ +- PostgreSQL +- S3 ### Interfaces {#interfaces} ClickHouse Cloud supports HTTPS, native interfaces, and the [MySQL wire protocol](/interfaces/mysql). Support for more interfaces such as Postgres is coming soon. @@ -72,14 +72,14 @@ Dictionaries are a popular way to speed up lookups in ClickHouse. ClickHouse Cl ### Federated queries {#federated-queries} We support federated ClickHouse queries for cross-cluster communication in the cloud, and for communication with external self-managed ClickHouse clusters. ClickHouse Cloud currently supports federated queries using the following integration engines: - - Deltalake - - Hudi - - MySQL - - MongoDB - - NATS - - RabbitMQ - - PostgreSQL - - S3 +- Deltalake +- Hudi +- MySQL +- MongoDB +- NATS +- RabbitMQ +- PostgreSQL +- S3 Federated queries with some external database and table engines, such as SQLite, ODBC, JDBC, Redis, HDFS and Hive are not yet supported. @@ -108,9 +108,9 @@ The following are default settings for ClickHouse Cloud services. 
In some cases, The default value of the `max_parts_in_total` setting for MergeTree tables has been lowered from 100,000 to 10,000. The reason for this change is that we observed that a large number of data parts is likely to cause a slow startup time of services in the cloud. A large number of parts usually indicate a choice of too granular partition key, which is typically done accidentally and should be avoided. The change of default will allow the detection of these cases earlier. #### `max_concurrent_queries: 1,000` {#max_concurrent_queries-1000} -Increased this per-server setting from the default of `100` to `1000` to allow for more concurrency. -This will result in `number of replicas * 1,000` concurrent queries for the offered tier services. -`1000` concurrent queries for Basic tier service limited to a single replica and `1000+` for Scale and Enterprise, +Increased this per-server setting from the default of `100` to `1000` to allow for more concurrency. +This will result in `number of replicas * 1,000` concurrent queries for the offered tier services. +`1000` concurrent queries for Basic tier service limited to a single replica and `1000+` for Scale and Enterprise, depending on the number of replicas configured. #### `max_table_size_to_drop: 1,000,000,000,000` {#max_table_size_to_drop-1000000000000} diff --git a/docs/cloud/reference/shared-catalog.md b/docs/cloud/reference/shared-catalog.md index fa474c41b74..9f3a078b325 100644 --- a/docs/cloud/reference/shared-catalog.md +++ b/docs/cloud/reference/shared-catalog.md @@ -38,29 +38,29 @@ Shared database engine builds on and improves the behavior of the Replicated dat ### Key benefits {#key-benefits} - **Atomic CREATE TABLE ... AS SELECT** - Table creation and data insertion are executed atomically—either the entire operation completes, or the table is not created at all. + Table creation and data insertion are executed atomically—either the entire operation completes, or the table is not created at all. - **RENAME TABLE between databases** - Enables atomic movement of tables across databases: - ```sql - RENAME TABLE db1.table TO db2.table; - ``` + Enables atomic movement of tables across databases: + ```sql + RENAME TABLE db1.table TO db2.table; + ``` - **Automatic table recovery with UNDROP TABLE** - Dropped tables are retained for a default period of 8 hours and can be restored: - ```sql - UNDROP TABLE my_table; - ``` - The retention window is configurable via server settings. + Dropped tables are retained for a default period of 8 hours and can be restored: + ```sql + UNDROP TABLE my_table; + ``` + The retention window is configurable via server settings. - **Improved compute-compute separation** - Unlike the Replicated database engine, which requires all replicas to be online to process a DROP query, Shared Catalog performs centralized metadata deletion. This allows operations to succeed even when some replicas are offline. + Unlike the Replicated database engine, which requires all replicas to be online to process a DROP query, Shared Catalog performs centralized metadata deletion. This allows operations to succeed even when some replicas are offline. - **Automatic metadata replication** - Shared Catalog ensures that database definitions are automatically replicated to all servers on startup. Operators do not need to manually configure or synchronize metadata on new instances. + Shared Catalog ensures that database definitions are automatically replicated to all servers on startup. 
Operators do not need to manually configure or synchronize metadata on new instances. - **Centralized, versioned metadata state** - Shared Catalog stores a single source of truth in ZooKeeper. When a replica starts, it fetches the latest state and applies the diff to reach consistency. During query execution, the system can wait for other replicas to reach at least the required version of metadata to ensure correctness. + Shared Catalog stores a single source of truth in ZooKeeper. When a replica starts, it fetches the latest state and applies the diff to reach consistency. During query execution, the system can wait for other replicas to reach at least the required version of metadata to ensure correctness. ## Usage in ClickHouse Cloud {#usage-in-clickhouse-cloud} @@ -82,4 +82,4 @@ Shared Catalog and the Shared database engine provide: - Improved support for elastic, ephemeral, or partially offline compute environments - Seamless usage for ClickHouse Cloud users -These capabilities make Shared Catalog the foundation for scalable, cloud-native metadata management in ClickHouse Cloud. + These capabilities make Shared Catalog the foundation for scalable, cloud-native metadata management in ClickHouse Cloud. diff --git a/docs/cloud/reference/shared-merge-tree.md b/docs/cloud/reference/shared-merge-tree.md index 589e96a7a0e..77707449b6f 100644 --- a/docs/cloud/reference/shared-merge-tree.md +++ b/docs/cloud/reference/shared-merge-tree.md @@ -10,7 +10,6 @@ import shared_merge_tree from '@site/static/images/cloud/reference/shared-merge- import shared_merge_tree_2 from '@site/static/images/cloud/reference/shared-merge-tree-2.png'; import Image from '@theme/IdealImage'; - # SharedMergeTree table engine The SharedMergeTree table engine family is a cloud-native replacement of the ReplicatedMergeTree engines that is optimized to work on top of shared storage (e.g. Amazon S3, Google Cloud Storage, MinIO, Azure Blob Storage). There is a SharedMergeTree analog for every specific MergeTree engine type, i.e. ReplacingSharedMergeTree replaces ReplacingReplicatedMergeTree. @@ -23,15 +22,15 @@ The SharedMergeTree table engine family powers ClickHouse Cloud. For an end-user - Faster scale-up and scale-down operations - More lightweight strong consistency for select queries -A significant improvement that the SharedMergeTree brings is that it provides a deeper separation of compute and storage compared to the ReplicatedMergeTree. You can see below how the ReplicatedMergeTree separate the compute and storage: + A significant improvement that the SharedMergeTree brings is that it provides a deeper separation of compute and storage compared to the ReplicatedMergeTree. You can see below how the ReplicatedMergeTree separate the compute and storage: -ReplicatedMergeTree Diagram + ReplicatedMergeTree Diagram -As you can see, even though the data stored in the ReplicatedMergeTree are in object storage, the metadata still resides on each of the clickhouse-servers. This means that for every replicated operation, metadata also needs to be replicated on all replicas. + As you can see, even though the data stored in the ReplicatedMergeTree are in object storage, the metadata still resides on each of the clickhouse-servers. This means that for every replicated operation, metadata also needs to be replicated on all replicas. -ReplicatedMergeTree Diagram with Metadata + ReplicatedMergeTree Diagram with Metadata -Unlike ReplicatedMergeTree, SharedMergeTree doesn't require replicas to communicate with each other. 
Instead, all communication happens through shared storage and clickhouse-keeper. SharedMergeTree implements asynchronous leaderless replication and uses clickhouse-keeper for coordination and metadata storage. This means that metadata doesn't need to be replicated as your service scales up and down. This leads to faster replication, mutation, merges and scale-up operations. SharedMergeTree allows for hundreds of replicas for each table, making it possible to dynamically scale without shards. A distributed query execution approach is used in ClickHouse Cloud to utilize more compute resources for a query. + Unlike ReplicatedMergeTree, SharedMergeTree doesn't require replicas to communicate with each other. Instead, all communication happens through shared storage and clickhouse-keeper. SharedMergeTree implements asynchronous leaderless replication and uses clickhouse-keeper for coordination and metadata storage. This means that metadata doesn't need to be replicated as your service scales up and down. This leads to faster replication, mutation, merges and scale-up operations. SharedMergeTree allows for hundreds of replicas for each table, making it possible to dynamically scale without shards. A distributed query execution approach is used in ClickHouse Cloud to utilize more compute resources for a query. ## Introspection {#introspection} @@ -41,7 +40,6 @@ Most of the system tables used for introspection of ReplicatedMergeTree exist fo This table serves as the alternative to `system.replication_queue` for SharedMergeTree. It stores information about the most recent set of current parts, as well as future parts in progress such as merges, mutations, and dropped partitions. - **system.shared_merge_tree_fetches** This table is the alternative to `system.replicated_fetches` SharedMergeTree. It contains information about current in-progress fetches of primary keys and checksums into memory. diff --git a/docs/cloud/reference/supported-regions.md b/docs/cloud/reference/supported-regions.md index c9b4d0ab549..0c5d248a9d7 100644 --- a/docs/cloud/reference/supported-regions.md +++ b/docs/cloud/reference/supported-regions.md @@ -24,13 +24,13 @@ import EnterprisePlanFeatureBadge from '@theme/badges/EnterprisePlanFeatureBadge - us-east-2 (Ohio) - us-west-2 (Oregon) -**Private Region:** + **Private Region:** - ca-central-1 (Canada) - af-south-1 (South Africa) - eu-north-1 (Stockholm) - sa-east-1 (South America) - ap-northeast-2 (South Korea, Seoul) - + ## Google Cloud regions {#google-cloud-regions} - asia-southeast1 (Singapore) @@ -38,7 +38,7 @@ import EnterprisePlanFeatureBadge from '@theme/badges/EnterprisePlanFeatureBadge - us-central1 (Iowa) - us-east1 (South Carolina) -**Private Region:** + **Private Region:** - us-west1 (Oregon) - australia-southeast1(Sydney) @@ -53,12 +53,12 @@ import EnterprisePlanFeatureBadge from '@theme/badges/EnterprisePlanFeatureBadge - East US 2 (Virginia) - Germany West Central (Frankfurt) -**Private Region:** + **Private Region:** -JapanEast -:::note -Need to deploy to a region not currently listed? [Submit a request](https://clickhouse.com/pricing?modal=open). -::: + JapanEast + :::note + Need to deploy to a region not currently listed? [Submit a request](https://clickhouse.com/pricing?modal=open). + ::: ## Private regions {#private-regions} @@ -72,8 +72,8 @@ Key considerations for private regions: - Manual scaling (both vertical and horizontal) can be enabled with a support ticket. 
- If a service requires configuration with CMEK, the customer must provide the AWS KMS key during service launch. - To launch new and additional services, requests will need to be made through a support ticket. - -Additional requirements may apply for HIPAA compliance (including signing a BAA). Note that HIPAA is currently available only for Enterprise tier services + + Additional requirements may apply for HIPAA compliance (including signing a BAA). Note that HIPAA is currently available only for Enterprise tier services. ## HIPAA compliant regions {#hipaa-compliant-regions} diff --git a/docs/cloud/reference/warehouses.md index 675c3f51b5e..b5a2bc9e236 100644 --- a/docs/cloud/reference/warehouses.md +++ b/docs/cloud/reference/warehouses.md @@ -25,27 +25,23 @@ Each ClickHouse Cloud service includes: - An endpoint (or multiple endpoints created via ClickHouse Cloud UI console), which is a service URL that you use to connect to the service (for example, `https://dv2fzne24g.us-east-1.aws.clickhouse.cloud:8443`). - An object storage folder where the service stores all the data and part of the metadata: -:::note -Child single services can scale vertically unlike single parent services. -::: - -Current service in ClickHouse Cloud + :::note + Child single services can scale vertically unlike single parent services. + ::: -
+ Current service in ClickHouse Cloud -_Fig. 1 - current service in ClickHouse Cloud_ + _Fig. 1 - current service in ClickHouse Cloud_ -Compute-compute separation allows users to create multiple compute node groups, each with its own endpoint, that are using the same object storage folder, and thus, with the same tables, views, etc. + Compute-compute separation allows users to create multiple compute node groups, each with its own endpoint, that use the same object storage folder and thus share the same tables, views, etc. -Each compute node group will have its own endpoint so you can choose which set of replicas to use for your workloads. Some of your workloads may be satisfied with only one small-size replica, and others may require full high-availability (HA) and hundreds of gigs of memory. Compute-compute separation also allows you to separate read operations from write operations so they don't interfere with each other: + Each compute node group will have its own endpoint so you can choose which set of replicas to use for your workloads. Some of your workloads may be satisfied with only one small-size replica, and others may require full high-availability (HA) and hundreds of gigabytes of memory. Compute-compute separation also allows you to separate read operations from write operations so they don't interfere with each other: -Compute separation in ClickHouse Cloud - -
+ Compute separation in ClickHouse Cloud -_Fig. 2 - compute separation in ClickHouse Cloud_ + _Fig. 2 - compute separation in ClickHouse Cloud_ -It is possible to create extra services that share the same data with your existing services, or create a completely new setup with multiple services sharing the same data. + It is possible to create extra services that share the same data with your existing services, or create a completely new setup with multiple services sharing the same data. ## What is a warehouse? {#what-is-a-warehouse} @@ -55,19 +51,17 @@ Each warehouse has a primary service (this service was created first) and second - Primary service `DWH Prod` - Secondary service `DWH Prod Subservice` -Warehouse example with primary and secondary services + Warehouse example with primary and secondary services -
- -_Fig. 3 - Warehouse example_ + _Fig. 3 - Warehouse example_ -All services in a warehouse share the same: + All services in a warehouse share the same: - Region (for example, us-east1) - Cloud service provider (AWS, GCP or Azure) - ClickHouse database version -You can sort services by the warehouse that they belong to. + You can sort services by the warehouse that they belong to. ## Access controls {#access-controls} @@ -106,8 +100,7 @@ _Fig. 6 - Read-write and Read-only services in a warehouse_ :::note 1. Read-only services currently allow user management operations (create, drop, etc). This behavior may be changed in the future. 2. Currently, refreshable materialized views are executed on all services in the warehouse, including read-only services. This behavior will be changed in the future, however, and they will be executed on RW services only. -::: - + ::: ## Scaling {#scaling} @@ -134,16 +127,15 @@ Once compute-compute is enabled for a service (at least one secondary service wa 6. **CREATE/RENAME/DROP DATABASE queries could be blocked by idled/stopped services by default.** These queries can hang. To bypass this, you can run database management queries with `settings distributed_ddl_task_timeout=0` at the session or per query level. For example: -```sql -CREATE DATABASE db_test_ddl_single_query_setting -SETTINGS distributed_ddl_task_timeout=0 -``` + ```sql + CREATE DATABASE db_test_ddl_single_query_setting + SETTINGS distributed_ddl_task_timeout=0 + ``` 6. **In very rare cases, secondary services that are idled or stopped for a long time (days) without waking/starting up can cause performance degradation to other services in the same warehouse.** This issue will be resolved soon and is connected to mutations running in the background. If you think you are experiencing this issue, please contact ClickHouse [Support](https://clickhouse.com/support/program). 7. **Currently there is a soft limit of 5 services per warehouse.** Contact the support team if you need more than 5 services in a single warehouse. - ## Pricing {#pricing} Compute prices are the same for all services in a warehouse (primary and secondary). Storage is billed only once - it is included in the first (original) service. diff --git a/docs/cloud/security/accessing-s3-data-securely.md b/docs/cloud/security/accessing-s3-data-securely.md index 7503cdccf9c..f86b42b86eb 100644 --- a/docs/cloud/security/accessing-s3-data-securely.md +++ b/docs/cloud/security/accessing-s3-data-securely.md @@ -58,7 +58,6 @@ This approach allows customers to manage all access to their S3 buckets in a sin *Note*: Do not put the full bucket Arn but instead just the bucket name only. - 5 - Select the **I acknowledge that AWS CloudFormation might create IAM resources with custom names.** checkbox 6 - Click **Create stack** button at bottom right @@ -136,7 +135,6 @@ ClickHouse Cloud has a new feature that allows you to specify `extra_credentials DESCRIBE TABLE s3('https://s3.amazonaws.com/BUCKETNAME/BUCKETOBJECT.csv','CSVWithNames',extra_credentials(role_arn = 'arn:aws:iam::111111111111:role/ClickHouseAccessRole-001')) ``` - Below is an example query that uses the `role_session_name` as a shared secret to query data from a bucket. If the `role_session_name` is not correct, this operation will fail. 
```sql diff --git a/docs/cloud/security/audit-logging.md index 178cfeb5ffa..c848dbd7dc2 100644 --- a/docs/cloud/security/audit-logging.md +++ b/docs/cloud/security/audit-logging.md @@ -10,7 +10,7 @@ import activity_log_1 from '@site/static/images/cloud/security/activity_log1.png import activity_log_2 from '@site/static/images/cloud/security/activity_log2.png'; import activity_log_3 from '@site/static/images/cloud/security/activity_log3.png'; -In ClickHouse Cloud, navigate to your organization details. +In ClickHouse Cloud, navigate to your organization details. ClickHouse Cloud activity tab @@ -18,24 +18,20 @@ In ClickHouse Cloud, navigate to your organization details. Select the **Audit** tab on the left menu to see what changes have been made to your ClickHouse Cloud organization - including who made the change and when it occurred. - The **Activity** page displays a table containing a list of events logged about your organization. By default, this list is sorted in reverse-chronological order (most-recent event at the top). Change the order of the table by clicking on the column headers. Each item of the table contains the following fields: - - **Activity:** A text snippet describing the event - **User:** The user that initiated the event - **IP Address:** When applicable, this field lists the IP Address of the user that initiated the event - **Time:** The timestamp of the event -ClickHouse Cloud Activity Table - -
+ ClickHouse Cloud Activity Table -You can use the search bar provided to isolate events based on some criteria like for example service name or IP address. You can also export this information in a CSV format for distribution or analysis in an external tool. + You can use the search bar provided to isolate events based on criteria such as service name or IP address. You can also export this information in CSV format for distribution or analysis in an external tool. -
+
ClickHouse Cloud Activity CSV export -
+
## List of events logged {#list-of-events-logged} @@ -68,5 +64,5 @@ The different types of events captured for the organization are grouped in 3 cat ## API for audit events {#api-for-audit-events} -Users can use the ClickHouse Cloud API `activity` endpoint to obtain an export +Users can use the ClickHouse Cloud API `activity` endpoint to obtain an export of audit events. Further details can be found in the [API reference](https://clickhouse.com/docs/cloud/manage/api/swagger). diff --git a/docs/cloud/security/aws-privatelink.md b/docs/cloud/security/aws-privatelink.md index 56bde44c5c6..0cc3c2fecd7 100644 --- a/docs/cloud/security/aws-privatelink.md +++ b/docs/cloud/security/aws-privatelink.md @@ -28,22 +28,18 @@ To restrict access to your ClickHouse Cloud services exclusively through AWS Pri ClickHouse Cloud currently supports [cross-region PrivateLink](https://aws.amazon.com/about-aws/whats-new/2024/11/aws-privatelink-across-region-connectivity/) in beta. ::: - **Please complete the following to enable AWS PrivateLink**: 1. Obtain Endpoint "Service name". 1. Create AWS Endpoint. 1. Add "Endpoint ID" to ClickHouse Cloud organization. 1. Add "Endpoint ID" to ClickHouse service allow list. - -Find Terraform examples [here](https://github.com/ClickHouse/terraform-provider-clickhouse/tree/main/examples/). - + Find Terraform examples [here](https://github.com/ClickHouse/terraform-provider-clickhouse/tree/main/examples/). ## Points of attention {#attention} ClickHouse attempts to group your services to reuse the same published [service endpoint](https://docs.aws.amazon.com/vpc/latest/privatelink/privatelink-share-your-services.html#endpoint-service-overview) within the AWS region. However, this grouping is not guaranteed, especially if you spread your services across multiple ClickHouse organizations. If you already have PrivateLink configured for other services in your ClickHouse organization, you can often skip most of the steps because of that grouping and proceed directly to the final step: Add ClickHouse "Endpoint ID" to ClickHouse service allow list. - ## Prerequisites for this process {#prerequisites} Before you get started you will need: @@ -108,9 +104,9 @@ Make a note of the `endpointServiceId` and `privateDnsHostname` [move onto next ### Create AWS endpoint {#create-aws-endpoint} :::important -This section covers ClickHouse-specific details for configuring ClickHouse via AWS PrivateLink. AWS-specific steps are provided as a reference to guide you on where to look, but they may change over time without notice from the AWS cloud provider. Please consider AWS configuration based on your specific use case. +This section covers ClickHouse-specific details for configuring ClickHouse via AWS PrivateLink. AWS-specific steps are provided as a reference to guide you on where to look, but they may change over time without notice from the AWS cloud provider. Please consider AWS configuration based on your specific use case. -Please note that ClickHouse is not responsible for configuring the required AWS VPC endpoints, security group rules or DNS records. +Please note that ClickHouse is not responsible for configuring the required AWS VPC endpoints, security group rules or DNS records. If you previously enabled "private DNS names" while setting up PrivateLink and are experiencing difficulties configuring new services via PrivateLink, please contact ClickHouse support. For any other issues related to AWS configuration tasks, contact AWS Support directly. 
::: @@ -159,11 +155,11 @@ Resources: - subnet-subnet_id1 - subnet-subnet_id2 - subnet-subnet_id3 - SecurityGroupIds: + SecurityGroupIds: - sg-security_group_id1 - sg-security_group_id2 - sg-security_group_id3 -``` + ``` After creating the VPC Endpoint, make a note of the `Endpoint ID` value; you'll need it for an upcoming step. @@ -207,7 +203,7 @@ If you want to allow access from an existing PrivateLink connection, use the exi Private Endpoints Filter -To remove please navigate to the ClickHouse Cloud console, find the service, then navigate to **Settings** of the service, find endpoint you would like to remove.Remove it from the list of endpoints. +To remove please navigate to the ClickHouse Cloud console, find the service, then navigate to **Settings** of the service, find endpoint you would like to remove.Remove it from the list of endpoints. #### Option 2: API {#option-2-api-2} @@ -268,7 +264,6 @@ curl --silent --user "${KEY_ID:?}:${KEY_SECRET:?}" \ Each service with Private Link enabled has a public and private endpoint. In order to connect using Private Link, you need to use a private endpoint which will be `privateDnsHostname`API or `DNS Name`console taken from [Obtain Endpoint "Service name"](#obtain-endpoint-service-info). - #### Getting private DNS hostname {#getting-private-dns-hostname} ##### Option 1: ClickHouse Cloud console {#option-1-clickhouse-cloud-console-3} diff --git a/docs/cloud/security/azure-privatelink.md b/docs/cloud/security/azure-privatelink.md index f213496c7a7..e22ebcc3ebb 100644 --- a/docs/cloud/security/azure-privatelink.md +++ b/docs/cloud/security/azure-privatelink.md @@ -47,7 +47,6 @@ Additional charges may be applied to inter-region traffic. Please check latest A 1. Add the Private Endpoint GUID to your service(s) allow list 1. Access your ClickHouse Cloud service using Private Link - ## Attention {#attention} ClickHouse attempts to group your services to reuse the same published [Private Link service](https://learn.microsoft.com/en-us/azure/private-link/private-link-service-overview) within the Azure region. However, this grouping is not guaranteed, especially if you spread your services across multiple ClickHouse organizations. If you already have Private Link configured for other services in your ClickHouse organization, you can often skip most of the steps because of that grouping and proceed directly to the final step: [Add the Private Endpoint GUID to your service(s) allow list](#add-private-endpoint-guid-to-services-allow-list). @@ -102,9 +101,9 @@ Make a note of the `endpointServiceId`. You'll use it in the next step. ## Create a private endpoint in Azure {#create-private-endpoint-in-azure} :::important -This section covers ClickHouse-specific details for configuring ClickHouse via Azure Private Link. Azure-specific steps are provided as a reference to guide you on where to look, but they may change over time without notice from the Azure cloud provider. Please consider Azure configuration based on your specific use case. +This section covers ClickHouse-specific details for configuring ClickHouse via Azure Private Link. Azure-specific steps are provided as a reference to guide you on where to look, but they may change over time without notice from the Azure cloud provider. Please consider Azure configuration based on your specific use case. -Please note that ClickHouse is not responsible for configuring the required Azure private endpoints, DNS records. 
+Please note that ClickHouse is not responsible for configuring the required Azure private endpoints, DNS records. For any issues related to Azure configuration tasks, contact Azure Support directly. ::: @@ -129,56 +128,56 @@ In the following screen, specify the following options: - **Name**: Set a name for the **Private Endpoint**. - **Region**: Choose region where the deployed VNet that will be connected to ClickHouse Cloud via Private Link. -After you have completed the above steps, click the **Next: Resource** button. + After you have completed the above steps, click the **Next: Resource** button. -Create Private Endpoint Basic + Create Private Endpoint Basic ---- + --- -Select the option **Connect to an Azure resource by resource ID or alias**. + Select the option **Connect to an Azure resource by resource ID or alias**. -For the **Resource ID or alias**, use the `endpointServiceId` you have obtained from the [Obtain Azure connection alias for Private Link](#obtain-azure-connection-alias-for-private-link) step. + For the **Resource ID or alias**, use the `endpointServiceId` you have obtained from the [Obtain Azure connection alias for Private Link](#obtain-azure-connection-alias-for-private-link) step. -Click **Next: Virtual Network** button. + Click **Next: Virtual Network** button. -Private Endpoint Resource Selection + Private Endpoint Resource Selection ---- + --- - **Virtual network**: Choose the VNet you want to connect to ClickHouse Cloud using Private Link - **Subnet**: Choose the subnet where Private Endpoint will be created -Optional: + Optional: - **Application security group**: You can attach ASG to Private Endpoint and use it in Network Security Groups to filter network traffic to/from Private Endpoint. -Click **Next: DNS** button. + Click **Next: DNS** button. -Private Endpoint Virtual Network Selection + Private Endpoint Virtual Network Selection -Click the **Next: Tags** button. + Click the **Next: Tags** button. ---- + --- -Private Endpoint DNS Configuration + Private Endpoint DNS Configuration -Optionally, you can attach tags to your Private Endpoint. + Optionally, you can attach tags to your Private Endpoint. -Click the **Next: Review + create** button. + Click the **Next: Review + create** button. ---- + --- -Private Endpoint Tags + Private Endpoint Tags -Finally, click the **Create** button. + Finally, click the **Create** button. -Private Endpoint Review + Private Endpoint Review -The **Connection status** of the created Private Endpoint will be in **Pending** state. It will change to **Approved** state once you add this Private Endpoint to the service allow list. + The **Connection status** of the created Private Endpoint will be in **Pending** state. It will change to **Approved** state once you add this Private Endpoint to the service allow list. -Open the network interface associated with Private Endpoint and copy the **Private IPv4 address**(10.0.0.4 in this example), you will need this information in the next steps. + Open the network interface associated with Private Endpoint and copy the **Private IPv4 address**(10.0.0.4 in this example), you will need this information in the next steps. -Private Endpoint IP Address + Private Endpoint IP Address ### Option 2: Using Terraform to create a private endpoint in Azure {#option-2-using-terraform-to-create-a-private-endpoint-in-azure} @@ -244,21 +243,21 @@ Create a wildcard record and point to your Private Endpoint: 4. For IP Address, type the IP address you see for Private Endpoint. 5. Select **OK**. 
-Private Link DNS Wildcard Setup + Private Link DNS Wildcard Setup -**Option 2: Using Terraform** + **Option 2: Using Terraform** -Use the following Terraform template to create a wildcard DNS record: + Use the following Terraform template to create a wildcard DNS record: -```json -resource "azurerm_private_dns_a_record" "example" { - name = "*" - zone_name = var.zone_name - resource_group_name = var.resource_group_name - ttl = 300 - records = ["10.0.0.4"] -} -``` + ```json + resource "azurerm_private_dns_a_record" "example" { + name = "*" + zone_name = var.zone_name + resource_group_name = var.resource_group_name + ttl = 300 + records = ["10.0.0.4"] + } + ``` ### Create a virtual network link {#create-a-virtual-network-link} @@ -276,7 +275,6 @@ There are various ways to configure DNS. Please set up DNS according to your spe You need to point "DNS name", taken from [Obtain Azure connection alias for Private Link](#obtain-azure-connection-alias-for-private-link) step, to Private Endpoint IP address. This ensures that services/components within your VPC/Network can resolve it properly. - ### Verify DNS setup {#verify-dns-setup} `xxxxxxxxxx.westus3.privatelink.azure.clickhouse.cloud` domain should be pointed to the Private Endpoint IP. (10.0.0.4 in this example). @@ -426,7 +424,6 @@ curl --silent --user "${KEY_ID:?}:${KEY_SECRET:?}" -X PATCH -H "Content-Type: ap Each service with Private Link enabled has a public and private endpoint. In order to connect using Private Link, you need to use a private endpoint which will be `privateDnsHostname`API or `DNS name`console taken from [Obtain Azure connection alias for Private Link](#obtain-azure-connection-alias-for-private-link). - ### Obtaining the private DNS hostname {#obtaining-the-private-dns-hostname} #### Option 1: ClickHouse Cloud console {#option-1-clickhouse-cloud-console-3} diff --git a/docs/cloud/security/cloud-access-management/cloud-access-management.md b/docs/cloud/security/cloud-access-management/cloud-access-management.md index 3add5cc3aaa..23f846aac4a 100644 --- a/docs/cloud/security/cloud-access-management/cloud-access-management.md +++ b/docs/cloud/security/cloud-access-management/cloud-access-management.md @@ -19,23 +19,22 @@ Users must be assigned an organization level role and may optionally be assigned - Users added to an organization via a SAML integration are automatically assigned the Member role, with least privilege and without access to any services until configured. - Service Admin is assigned the SQL console admin role by default. SQL console permissions may be removed in the service settings page. + | Context | Role | Description | + |:-------------|:-----------------------|:-------------------------------------------------| + | Organization | Admin | Perform all administrative activities for an organization and control all settings. Assigned to the first user in the organization by default. | + | Organization | Developer | View access to everything except Services, ability to generate read-only API keys. | + | Organization | Billing | View usage and invoices, and manage payment methods. | + | Organization | Member | Sign-in only with the ability to manage personal profile settings. Assigned to SAML SSO users by default. | + | Service | Service Admin | Manage service settings. | + | Service | Service Read Only | View services and settings. | + | SQL console | SQL console admin | Administrative access to databases within the service equivalent to the Default database role. 
| + | SQL console | SQL console read only | Read only access to databases within the service | + | SQL console | Custom | Configure using SQL [`GRANT`](/sql-reference/statements/grant) statement; assign the role to a SQL console user by naming the role after the user | + + To create a custom role for a SQL console user and grant it a general role, run the following commands. The email address must match the user's email address in the console. -| Context | Role | Description | -|:-------------|:-----------------------|:-------------------------------------------------| -| Organization | Admin | Perform all administrative activities for an organization and control all settings. Assigned to the first user in the organization by default. | -| Organization | Developer | View access to everything except Services, ability to generate read-only API keys. | -| Organization | Billing | View usage and invoices, and manage payment methods. | -| Organization | Member | Sign-in only with the ability to manage personal profile settings. Assigned to SAML SSO users by default. | -| Service | Service Admin | Manage service settings. | -| Service | Service Read Only | View services and settings. | -| SQL console | SQL console admin | Administrative access to databases within the service equivalent to the Default database role. | -| SQL console | SQL console read only | Read only access to databases within the service | -| SQL console | Custom | Configure using SQL [`GRANT`](/sql-reference/statements/grant) statement; assign the role to a SQL console user by naming the role after the user | - -To create a custom role for a SQL console user and grant it a general role, run the following commands. The email address must match the user's email address in the console. - 1. Create the database_developer role and grant `SHOW`, `CREATE`, `ALTER`, and `DELETE` permissions. - + ```sql CREATE ROLE OR REPLACE database_developer; GRANT SHOW ON * TO database_developer; @@ -43,9 +42,9 @@ To create a custom role for a SQL console user and grant it a general role, run GRANT ALTER ON * TO database_developer; GRANT DELETE ON * TO database_developer; ``` - + 2. Create a role for the SQL console user my.user@domain.com and assign it the database_developer role. - + ```sql CREATE ROLE OR REPLACE `sql-console-role:my.user@domain.com`; GRANT database_developer TO `sql-console-role:my.user@domain.com`; @@ -62,23 +61,22 @@ Configure the following within the services and databases using the SQL [GRANT]( | Default | Full administrative access to services | | Custom | Configure using the SQL [`GRANT`](/sql-reference/statements/grant) statement | - - Database roles are additive. This means if a user is a member of two roles, the user has the most access granted to the two roles. They do not lose access by adding roles. - Database roles can be granted to other roles, resulting in a hierarchical structure. Roles inherit all permissions of the roles for which it is a member. - Database roles are unique per service and may be applied across multiple databases within the same service. -The illustration below shows the different ways a user could be granted permissions. + The illustration below shows the different ways a user could be granted permissions. 
-An illustration showing the different ways a user could be granted permissions + An illustration showing the different ways a user could be granted permissions -### Initial settings {#initial-settings} +### Initial settings {#initial-settings} Databases have an account named `default` that is added automatically and granted the default_role upon service creation. The user that creates the service is presented with the automatically generated, random password that is assigned to the `default` account when the service is created. The password is not shown after initial setup, but may be changed by any user with Service Admin permissions in the console at a later time. This account or an account with Service Admin privileges within the console may set up additional database users and roles at any time. :::note To change the password assigned to the `default` account in the console, go to the Services menu on the left, access the service, go to the Settings tab and click the Reset password button. ::: -We recommend creating a new user account associated with a person and granting the user the default_role. This is so activities performed by users are identified to their user IDs and the `default` account is reserved for break-glass type activities. +We recommend creating a new user account associated with a person and granting the user the default_role. This is so activities performed by users are identified to their user IDs and the `default` account is reserved for break-glass type activities. ```sql CREATE USER userID IDENTIFIED WITH sha256_hash by 'hashed_password'; @@ -90,7 +88,7 @@ Users can use a SHA256 hash generator or code function such as `hashlib` in Pyth ### Database access listings with SQL console users {#database-access-listings-with-sql-console-users} The following process can be used to generate a complete access listing across the SQL console and databases in your organization. -1. Run the following queries to get a list of all grants in the database. +1. Run the following queries to get a list of all grants in the database. ```sql SELECT grants.user_name, @@ -101,9 +99,9 @@ The following process can be used to generate a complete access listing across t grants.table FROM system.grants LEFT OUTER JOIN system.role_grants ON grants.role_name = role_grants.granted_role_name LEFT OUTER JOIN system.users ON role_grants.user_name = users.name - + UNION ALL - + SELECT grants.user_name, grants.role_name, role_grants.role_name AS role_member, @@ -113,9 +111,9 @@ The following process can be used to generate a complete access listing across t FROM system.role_grants LEFT OUTER JOIN system.grants ON role_grants.granted_role_name = grants.role_name WHERE role_grants.user_name is null; ``` - + 2. Associate this list to Console users with access to SQL console. - + a. Go to the Console. b. Select the relevant service. @@ -125,4 +123,3 @@ The following process can be used to generate a complete access listing across t d. Scroll to the SQL console access section. e. Click the link for the number of users with access to the database `There are # users with access to this service.` to see the user listing. 
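One hedged way to produce the hashed password for the `sha256_hash` method in the `CREATE USER` example above is to compute the digest in ClickHouse itself using the built-in `SHA256` and `hex` functions; the password below is only a placeholder, and any external SHA-256 generator (such as Python's `hashlib`, mentioned above) yields the same result.

```sql
-- Sketch: compute the SHA-256 hex digest of a placeholder password.
-- Pass the resulting string to CREATE USER ... IDENTIFIED WITH sha256_hash BY '<hash>'.
SELECT lower(hex(SHA256('REPLACE_WITH_A_STRONG_PASSWORD'))) AS hashed_password;
```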
- diff --git a/docs/cloud/security/cloud-access-management/cloud-authentication.md b/docs/cloud/security/cloud-access-management/cloud-authentication.md index c0138a1d18e..bff3e3f5141 100644 --- a/docs/cloud/security/cloud-access-management/cloud-authentication.md +++ b/docs/cloud/security/cloud-access-management/cloud-authentication.md @@ -17,10 +17,10 @@ ClickHouse Cloud provides a number of ways to authenticate. This guide explains Minimum password settings for our console and services (databases) currently comply with [NIST 800-63B](https://pages.nist.gov/800-63-3/sp800-63b.html#sec4) Authenticator Assurance Level 1: - Minimum 12 characters - Includes 3 of the following 4 items: - - 1 uppercase letter - - 1 lowercase letter - - 1 number - - 1 special character + - 1 uppercase letter + - 1 lowercase letter + - 1 number + - 1 special character ## Email and password {#email--password} @@ -28,7 +28,7 @@ ClickHouse Cloud allows you to authenticate with an email address and password. ## SSO using Google or Microsoft social authentication {#sso-using-google-or-microsoft-social-authentication} -If your company uses Google Workspace or Microsoft 365, you can leverage your current single sign-on setup within ClickHouse Cloud. To do this, simply sign up using your company email address and invite other users using their company email. The effect is that your users must login using your company's login flows, whether via your identity provider or directly through Google or Microsoft authentication, before they can authenticate into ClickHouse Cloud. +If your company uses Google Workspace or Microsoft 365, you can leverage your current single sign-on setup within ClickHouse Cloud. To do this, simply sign up using your company email address and invite other users using their company email. The effect is that your users must login using your company's login flows, whether via your identity provider or directly through Google or Microsoft authentication, before they can authenticate into ClickHouse Cloud. ## Multi-factor authentication {#multi-factor-authentication} @@ -43,73 +43,63 @@ Users with email + password or social authentication can further secure their ac 8. On the next screen, copy the recovery code and store it in a safe place 9. Check the box next to `I have safely recorded this code` 10. Click Continue - + ## Account recovery {#account-recovery} -
- Obtain recovery code - - If you previously enrolled in MFA and either did not create or misplaced your recovery code, follow these steps to get a new recovery code: - 1. Go to https://console.clickhouse.cloud - 2. Sign in with your credentials and MFA - 3. Go to your profile in the upper left corner - 4. Click Security on the left - 5. Click the trash can next to your Authenticator app - 6. Click Remove authenticator app - 7. Enter your code and click Continue - 8. Click Set up in the Authenticator app section - 9. Scan the QR code and input the new code - 10. Copy your recovery code and store it in a safe place - 11. Check the box next to `I have safely recorded this code` - 12. Click Continue - +
+Obtain recovery code +If you previously enrolled in MFA and either did not create or misplaced your recovery code, follow these steps to get a new recovery code: +1. Go to https://console.clickhouse.cloud +2. Sign in with your credentials and MFA +3. Go to your profile in the upper left corner +4. Click Security on the left +5. Click the trash can next to your Authenticator app +6. Click Remove authenticator app +7. Enter your code and click Continue +8. Click Set up in the Authenticator app section +9. Scan the QR code and input the new code +10. Copy your recovery code and store it in a safe place +11. Check the box next to `I have safely recorded this code` +12. Click Continue
- Forgot password - - If you forgot your password, follow these steps for self-service recovery: - 1. Go to https://console.clickhouse.cloud - 2. Enter your email address and click Continue - 3. Click Forgot your password? - 4. Click Send password reset link - 5. Check your email and click Reset password from the email - 6. Enter your new password, confirm the password and click Update password - 7. Click Back to sign in - 8. Sign in normally with your new password - +Forgot password +If you forgot your password, follow these steps for self-service recovery: +1. Go to https://console.clickhouse.cloud +2. Enter your email address and click Continue +3. Click Forgot your password? +4. Click Send password reset link +5. Check your email and click Reset password from the email +6. Enter your new password, confirm the password and click Update password +7. Click Back to sign in +8. Sign in normally with your new password
- Lost MFA device or token - - If you lost your MFA device or deleted your token, follow these steps to recover and create a new token: - 1. Go to https://console.clickhouse.cloud - 2. Enter your credentials and click Continue - 3. On the Multi-factor authentication screen click Cancel - 4. Click Recovery code - 5. Enter the code and press Continue - 6. Copy the new recovery code and store it somewhere safe - 7. Click the box next to `I have safely recorded this code` and click continue - 8. Once signed in, go to your profile in the upper left - 9. Click on security in the upper left - 10. Click the trash can icon next to Authenticator app to remove your old authenticator - 11. Click Remove authenticator app - 12. When prompted for your Multi-factor authentication, click Cancel - 13. Click Recovery code - 14. Enter your recovery code (this is the new code generated in step 7) and click Continue - 15. Copy the new recovery code and store it somewhere safe - this is a fail safe in case you leave the screen during the removal process - 16. Click the box next to `I have safely recorded this code` and click Continue - 17. Follow the process above to set up a new MFA factor - +Lost MFA device or token +If you lost your MFA device or deleted your token, follow these steps to recover and create a new token: +1. Go to https://console.clickhouse.cloud +2. Enter your credentials and click Continue +3. On the Multi-factor authentication screen click Cancel +4. Click Recovery code +5. Enter the code and press Continue +6. Copy the new recovery code and store it somewhere safe +7. Click the box next to `I have safely recorded this code` and click continue +8. Once signed in, go to your profile in the upper left +9. Click on security in the upper left +10. Click the trash can icon next to Authenticator app to remove your old authenticator +11. Click Remove authenticator app +12. When prompted for your Multi-factor authentication, click Cancel +13. Click Recovery code +14. Enter your recovery code (this is the new code generated in step 7) and click Continue +15. Copy the new recovery code and store it somewhere safe - this is a fail safe in case you leave the screen during the removal process +16. Click the box next to `I have safely recorded this code` and click Continue +17. Follow the process above to set up a new MFA factor
- Lost MFA and recovery code - - If you lost your MFA device AND recovery code or you lost your MFA device and never obtained a recovery code, follow these steps to request a reset: - - **Submit a ticket**: If you are in an organization that has other administrative users, even if you are attempting to access a single user organization, ask a member of your organization assigned the Admin role to log into the organization and submit a support ticket to reset your MFA on your behalf. Once we verify the request is authenticated, we will reset your MFA and notify the Admin. Sign in as usual without MFA and go to your profile settings to enroll a new factor if you wish. - - **Reset via email**: If you are the only user in the organization, submit a support case via email (support@clickhouse.com) using the email address associated with your account. Once we verify the request is coming from the correct email, we will reset your MFA AND password. Access your email to access the password reset link. Set up a new password then go to your profile settings to enroll a new factor if you wish. - +Lost MFA and recovery code +If you lost your MFA device AND recovery code or you lost your MFA device and never obtained a recovery code, follow these steps to request a reset: +**Submit a ticket**: If you are in an organization that has other administrative users, even if you are attempting to access a single user organization, ask a member of your organization assigned the Admin role to log into the organization and submit a support ticket to reset your MFA on your behalf. Once we verify the request is authenticated, we will reset your MFA and notify the Admin. Sign in as usual without MFA and go to your profile settings to enroll a new factor if you wish. +**Reset via email**: If you are the only user in the organization, submit a support case via email (support@clickhouse.com) using the email address associated with your account. Once we verify the request is coming from the correct email, we will reset your MFA AND password. Access your email to access the password reset link. Set up a new password then go to your profile settings to enroll a new factor if you wish.
## SAML SSO {#saml-sso} diff --git a/docs/cloud/security/cmek.md b/docs/cloud/security/cmek.md index 1d9651cb53f..8e8de42e947 100644 --- a/docs/cloud/security/cmek.md +++ b/docs/cloud/security/cmek.md @@ -30,14 +30,13 @@ TDE must be enabled on service creation. Existing services cannot be encrypted a ## Customer Managed Encryption Keys (CMEK) {#customer-managed-encryption-keys-cmek} :::warning -Deleting a KMS key used to encrypt a ClickHouse Cloud service will cause your ClickHouse service to be stopped and its data will be unretrievable, along with existing backups. To prevent accidental data loss when rotating keys you may wish to maintain old KMS keys for a period of time prior to deletion. +Deleting a KMS key used to encrypt a ClickHouse Cloud service will cause your ClickHouse service to be stopped and its data will be unretrievable, along with existing backups. To prevent accidental data loss when rotating keys you may wish to maintain old KMS keys for a period of time prior to deletion. ::: Once a service is encrypted with TDE, customers may update the key to enable CMEK. The service will automatically restart after updating the TDE setting. During this process, the old KMS key decrypts the data encrypting key (DEK), and the new KMS key re-encrypts the DEK. This ensures that the service on restart will use the new KMS key for encryption operations moving forward. This process may take several minutes.
- Enable CMEK with AWS KMS - +Enable CMEK with AWS KMS 1. In ClickHouse Cloud, select the encrypted service 2. Click on the Settings on the left 3. At the bottom of the screen, expand the Network security information @@ -67,12 +66,10 @@ Once a service is encrypted with TDE, customers may update the key to enable CME 11. Copy the Key ARN 12. Return to ClickHouse Cloud and paste the Key ARN in the Transparent Data Encryption section of the Service Settings 13. Save the change -
- Enable CMEK with GCP KMS - +Enable CMEK with GCP KMS 1. In ClickHouse Cloud, select the encrypted service 2. Click on the Settings on the left 3. At the bottom of the screen, expand the Network security information @@ -80,13 +77,12 @@ Once a service is encrypted with TDE, customers may update the key to enable CME 5. [Create a KMS key for GCP](https://cloud.google.com/kms/docs/create-key) 6. Click the key 7. Grant the following permissions to the GCP Encryption Service Account copied in step 4 above. - - Cloud KMS CryptoKey Encrypter/Decrypter - - Cloud KMS Viewer +- Cloud KMS CryptoKey Encrypter/Decrypter +- Cloud KMS Viewer 10. Save the Key permission 11. Copy the Key Resource Path 12. Return to ClickHouse Cloud and paste the Key Resource Path in the Transparent Data Encryption section of the Service Settings 13. Save the change -
## Key rotation {#key-rotation} diff --git a/docs/cloud/security/common-access-management-queries.md b/docs/cloud/security/common-access-management-queries.md index 24b98073491..cc55abc6014 100644 --- a/docs/cloud/security/common-access-management-queries.md +++ b/docs/cloud/security/common-access-management-queries.md @@ -38,13 +38,13 @@ These automatically generated SQL Console users have the `default` role. ## Passwordless authentication {#passwordless-authentication} -There are two roles available for SQL console: `sql_console_admin` with identical permissions to `default_role` and `sql_console_read_only` with read-only permissions. +There are two roles available for SQL console: `sql_console_admin` with identical permissions to `default_role` and `sql_console_read_only` with read-only permissions. Admin users are assigned the `sql_console_admin` role by default, so nothing changes for them. However, the `sql_console_read_only` role allows non-admin users to be granted read-only or full access to any instance. An admin needs to configure this access. The roles can be adjusted using the `GRANT` or `REVOKE` commands to better fit instance-specific requirements, and any modifications made to these roles will be persisted. ### Granular access control {#granular-access-control} -This access control functionality can also be configured manually for user-level granularity. Before assigning the new `sql_console_*` roles to users, SQL console user-specific database roles matching the namespace `sql-console-role:` should be created. For example: +This access control functionality can also be configured manually for user-level granularity. Before assigning the new `sql_console_*` roles to users, SQL console user-specific database roles matching the namespace `sql-console-role:` should be created. For example: ```sql CREATE ROLE OR REPLACE sql-console-role:; diff --git a/docs/cloud/security/compliance-overview.md b/docs/cloud/security/compliance-overview.md index 4653c0f09c1..f51635fee24 100644 --- a/docs/cloud/security/compliance-overview.md +++ b/docs/cloud/security/compliance-overview.md @@ -13,11 +13,11 @@ ClickHouse Cloud evaluates the security and compliance needs of our customers an ### SOC 2 Type II (since 2022) {#soc-2-type-ii-since-2022} -System and Organization Controls (SOC) 2 is a report focusing on security, availability, confidentiality, processing integrity and privacy criteria contained in the Trust Services Criteria (TSC) as applied to an organization's systems and is designed to provide assurance about these controls to relying parties (our customers). ClickHouse works with independent external auditors to undergo an audit at least once per year addressing security, availability and processing integrity of our systems and confidentiality and privacy of the data processed by our systems. The report addresses both our ClickHouse Cloud and Bring Your Own Cloud (BYOC) offerings. +System and Organization Controls (SOC) 2 is a report focusing on security, availability, confidentiality, processing integrity and privacy criteria contained in the Trust Services Criteria (TSC) as applied to an organization's systems and is designed to provide assurance about these controls to relying parties (our customers). ClickHouse works with independent external auditors to undergo an audit at least once per year addressing security, availability and processing integrity of our systems and confidentiality and privacy of the data processed by our systems. 
The report addresses both our ClickHouse Cloud and Bring Your Own Cloud (BYOC) offerings. ### ISO 27001 (since 2023) {#iso-27001-since-2023} -International Standards Organization (ISO) 27001 is an international standard for information security. It requires companies to implement an Information Security Management System (ISMS) that includes processes for managing risks, creating and communicating policies, implementing security controls, and monitoring to ensure components remain relevant and effective. ClickHouse conducts internal audits and works with independent external auditors to undergo audits and interim inspections for the 2 years between certificate issuance. +International Standards Organization (ISO) 27001 is an international standard for information security. It requires companies to implement an Information Security Management System (ISMS) that includes processes for managing risks, creating and communicating policies, implementing security controls, and monitoring to ensure components remain relevant and effective. ClickHouse conducts internal audits and works with independent external auditors to undergo audits and interim inspections for the 2 years between certificate issuance. ### U.S. DPF (since 2024) {#us-dpf-since-2024} @@ -53,7 +53,7 @@ In addition to the items above, ClickHouse maintains internal compliance program ### Processing locations {#processing-locations} - [Sub-Processors and Affiliates](https://clickhouse.com/legal/agreements/subprocessors) -- [Data Processing Locations](https://trust.clickhouse.com) +- [Data Processing Locations](https://trust.clickhouse.com) ### Additional procedures {#additional-procedures} @@ -62,4 +62,4 @@ In addition to the items above, ClickHouse maintains internal compliance program # Payment compliance -ClickHouse provides a secure method to pay by credit card that is compliant with [PCI SAQ A v4.0](https://www.pcisecuritystandards.org/document_library/). +ClickHouse provides a secure method to pay by credit card that is compliant with [PCI SAQ A v4.0](https://www.pcisecuritystandards.org/document_library/). diff --git a/docs/cloud/security/gcp-private-service-connect.md b/docs/cloud/security/gcp-private-service-connect.md index 1113b3346c9..d786ec08a67 100644 --- a/docs/cloud/security/gcp-private-service-connect.md +++ b/docs/cloud/security/gcp-private-service-connect.md @@ -33,22 +33,20 @@ Service producers publish their applications to consumers by creating Private Se By default, a ClickHouse service is not available over a Private Service connection even if the PSC connection is approved and established; you need to explicitly add the PSC ID to the allow list on an instance level by completing [step](#add-endpoint-id-to-services-allow-list) below. ::: - **Important considerations for using Private Service Connect Global Access**: 1. Regions utilizing Global Access must belong to the same VPC. 1. Global Access must be explicitly enabled at the PSC level (refer to the screenshot below). 1. Ensure that your firewall settings do not block access to PSC from other regions. 1. Be aware that you may incur GCP inter-region data transfer charges. -Cross-region connectivity is not supported. The producer and consumer regions must be the same. However, you can connect from other regions within your VPC by enabling [Global Access](https://cloud.google.com/vpc/docs/about-accessing-vpc-hosted-services-endpoints#global-access) at the Private Service Connect (PSC) level. + Cross-region connectivity is not supported.
The producer and consumer regions must be the same. However, you can connect from other regions within your VPC by enabling [Global Access](https://cloud.google.com/vpc/docs/about-accessing-vpc-hosted-services-endpoints#global-access) at the Private Service Connect (PSC) level. -**Please complete the following to enable GCP PSC**: + **Please complete the following to enable GCP PSC**: 1. Obtain GCP service attachment for Private Service Connect. 1. Create a service endpoint. 1. Add "Endpoint ID" to ClickHouse Cloud service. 1. Add "Endpoint ID" to ClickHouse service allow list. - ## Attention {#attention} ClickHouse attempts to group your services to reuse the same published [PSC endpoint](https://cloud.google.com/vpc/docs/private-service-connect) within the GCP region. However, this grouping is not guaranteed, especially if you spread your services across multiple ClickHouse organizations. If you already have PSC configured for other services in your ClickHouse organization, you can often skip most of the steps because of that grouping and proceed directly to the final step: [Add "Endpoint ID" to ClickHouse service allow list](#add-endpoint-id-to-services-allow-list). @@ -59,11 +57,11 @@ Find Terraform examples [here](https://github.com/ClickHouse/terraform-provider- :::note Code examples are provided below to show how to set up Private Service Connect within a ClickHouse Cloud service. In our examples below, we will use: - - GCP region: `us-central1` - - GCP project (customer GCP project): `my-gcp-project` - - GCP private IP address in customer GCP project: `10.128.0.2` - - GCP VPC in customer GCP project: `default` -::: +- GCP region: `us-central1` +- GCP project (customer GCP project): `my-gcp-project` +- GCP private IP address in customer GCP project: `10.128.0.2` +- GCP VPC in customer GCP project: `default` + ::: You'll need to retrieve information about your ClickHouse Cloud service. You can do this either via the ClickHouse Cloud console or the ClickHouse API. If you are going to use the ClickHouse API, please set the following environment variables before proceeding: @@ -87,8 +85,8 @@ jq ".result[] | select (.region==\"${REGION:?}\" and .provider==\"${PROVIDER:?}\ ``` :::note - - You can retrieve your Organization ID from ClickHouse console(Organization -> Organization Details). - - You can [create a new key](/cloud/manage/openapi) or use an existing one. +- You can retrieve your Organization ID from ClickHouse console(Organization -> Organization Details). +- You can [create a new key](/cloud/manage/openapi) or use an existing one. ::: ## Obtain GCP service attachment and DNS name for Private Service Connect {#obtain-gcp-service-attachment-and-dns-name-for-private-service-connect} @@ -120,9 +118,9 @@ Make a note of the `endpointServiceId` and `privateDnsHostname`. You'll use them ## Create service endpoint {#create-service-endpoint} :::important -This section covers ClickHouse-specific details for configuring ClickHouse via GCP PSC(Private Service Connect). GCP-specific steps are provided as a reference to guide you on where to look, but they may change over time without notice from the GCP cloud provider. Please consider GCP configuration based on your specific use case. +This section covers ClickHouse-specific details for configuring ClickHouse via GCP PSC(Private Service Connect). GCP-specific steps are provided as a reference to guide you on where to look, but they may change over time without notice from the GCP cloud provider. 
Please consider GCP configuration based on your specific use case. -Please note that ClickHouse is not responsible for configuring the required GCP PSC endpoints, DNS records. +Please note that ClickHouse is not responsible for configuring the required GCP PSC endpoints, DNS records. For any issues related to GCP configuration tasks, contact GCP Support directly. ::: @@ -147,15 +145,15 @@ Open the Private Service Connect creation dialog by clicking on the **Connect En - **Network/Subnetwork/IP address**: Choose the network you want to use for the connection. You will need to create an IP address or use an existing one for the Private Service Connect endpoint. In our example, we pre-created an address with the name **your-ip-address** and assigned IP address `10.128.0.2` - To make the endpoint available from any region, you can enable the **Enable global access** checkbox. -Enable Global Access for Private Service Connect + Enable Global Access for Private Service Connect -To create the PSC Endpoint, use the **ADD ENDPOINT** button. + To create the PSC Endpoint, use the **ADD ENDPOINT** button. -The **Status** column will change from **Pending** to **Accepted** once the connection is approved. + The **Status** column will change from **Pending** to **Accepted** once the connection is approved. -Copy PSC Connection ID + Copy PSC Connection ID -Copy ***PSC Connection ID***, we are going to use it as ***Endpoint ID*** in the next steps. + Copy ***PSC Connection ID***, we are going to use it as ***Endpoint ID*** in the next steps. #### Option 2: Using Terraform {#option-2-using-terraform} @@ -280,7 +278,6 @@ curl --silent --user "${KEY_ID:?}:${KEY_SECRET:?}" -X PATCH -H "Content-Type: ap You need to add an Endpoint ID to the allow-list for each instance that should be available using Private Service Connect. - ### Option 1: ClickHouse Cloud console {#option-1-clickhouse-cloud-console-2} In the ClickHouse Cloud console, open the service that you would like to connect via Private Service Connect, then navigate to **Settings**. Enter the `Endpoint ID` retrieved from the [Adding a Private Service Connection](#adding-a-private-service-connection) step. Click **Create endpoint**. @@ -335,7 +332,6 @@ curl --silent --user "${KEY_ID:?}:${KEY_SECRET:?}" -X PATCH -H "Content-Type: ap Each service with Private Link enabled has a public and private endpoint. In order to connect using Private Link, you need to use a private endpoint which will be `privateDnsHostname` taken from [Obtain GCP service attachment for Private Service Connect](#obtain-gcp-service-attachment-and-dns-name-for-private-service-connect). - ### Getting private DNS hostname {#getting-private-dns-hostname} #### Option 1: ClickHouse Cloud console {#option-1-clickhouse-cloud-console-3} @@ -436,4 +432,3 @@ To do this, configure your GCP VPC firewall rules to allow connections from Clic ## More information {#more-information} For more detailed information, visit [cloud.google.com/vpc/docs/configure-private-service-connect-services](https://cloud.google.com/vpc/docs/configure-private-service-connect-services). - diff --git a/docs/cloud/security/saml-sso-setup.md b/docs/cloud/security/saml-sso-setup.md index ea2ad2f2613..55a764be5a1 100644 --- a/docs/cloud/security/saml-sso-setup.md +++ b/docs/cloud/security/saml-sso-setup.md @@ -32,92 +32,56 @@ We recommend setting up a **direct link to your organization** in addition to yo ### Steps {#steps}
- Get your organization ID - - All setups require your organization ID. To obtain your organization ID: - - 1. Sign in to your [ClickHouse Cloud](https://console.clickhouse.cloud) organization. - - Organization ID - - 3. In the lower left corner, click on your organization name under **Organization**. - - 4. In the pop-up menu, select **Organization details**. - - 5. Make note of your **Organization ID** to use below. - + Get your organization ID +All setups require your organization ID. To obtain your organization ID: +1. Sign in to your [ClickHouse Cloud](https://console.clickhouse.cloud) organization. +Organization ID +3. In the lower left corner, click on your organization name under **Organization**. +4. In the pop-up menu, select **Organization details**. +5. Make note of your **Organization ID** to use below.
-
- Configure your SAML integration - - ClickHouse uses service provider-initiated SAML connections. This means you can log in via https://console.clickhouse.cloud or via a direct link. We do not currently support identity provider initiated connections. Basic SAML configurations include the following: - - - SSO URL or ACS URL: `https://auth.clickhouse.cloud/login/callback?connection={organizationid}` - - - Audience URI or Entity ID: `urn:auth0:ch-production:{organizationid}` - - - Application username: `email` - - - Attribute mapping: `email = user.email` - - - Direct link to access your organization: `https://console.clickhouse.cloud/?connection={organizationid}` - - - For specific configuration steps, refer to your specific identity provider below. - +
+ Configure your SAML integration +ClickHouse uses service provider-initiated SAML connections. This means you can log in via https://console.clickhouse.cloud or via a direct link. We do not currently support identity provider initiated connections. Basic SAML configurations include the following: +- SSO URL or ACS URL: `https://auth.clickhouse.cloud/login/callback?connection={organizationid}` +- Audience URI or Entity ID: `urn:auth0:ch-production:{organizationid}` +- Application username: `email` +- Attribute mapping: `email = user.email` +- Direct link to access your organization: `https://console.clickhouse.cloud/?connection={organizationid}` +For specific configuration steps, refer to your specific identity provider below.
- Obtain your connection information - - Obtain your Identity provider SSO URL and x.509 certificate. Refer to your specific identity provider below for instructions on how to retrieve this information. - + Obtain your connection information +Obtain your Identity provider SSO URL and x.509 certificate. Refer to your specific identity provider below for instructions on how to retrieve this information.
-
- Submit a support case - - 1. Return to the ClickHouse Cloud console. - - 2. Select **Help** on the left, then the Support submenu. - - 3. Click **New case**. - - 4. Enter the subject "SAML SSO Setup". - - 5. In the description, paste any links gathered from the instructions above and attach the certificate to the ticket. - - 6. Please also let us know which domains should be allowed for this connection (e.g. domain.com, domain.ai, etc.). - - 7. Create a new case. - - 8. We will complete the setup within ClickHouse Cloud and let you know when it's ready to test. - + Submit a support case +1. Return to the ClickHouse Cloud console. +2. Select **Help** on the left, then the Support submenu. +3. Click **New case**. +4. Enter the subject "SAML SSO Setup". +5. In the description, paste any links gathered from the instructions above and attach the certificate to the ticket. +6. Please also let us know which domains should be allowed for this connection (e.g. domain.com, domain.ai, etc.). +7. Create a new case. +8. We will complete the setup within ClickHouse Cloud and let you know when it's ready to test.
- Complete the setup - - 1. Assign user access within your Identity Provider. - - 2. Log in to ClickHouse via https://console.clickhouse.cloud OR the direct link you configured in 'Configure your SAML integration' above. Users are initially assigned the 'Member' role, which can log in to the organization and update personal settings. - - 3. Log out of the ClickHouse organization. - - 4. Log in with your original authentication method to assign the Admin role to your new SSO account. - - For email + password accounts, please use `https://console.clickhouse.cloud/?with=email`. - - For social logins, please click the appropriate button (**Continue with Google** or **Continue with Microsoft**) - + Complete the setup +1. Assign user access within your Identity Provider. +2. Log in to ClickHouse via https://console.clickhouse.cloud OR the direct link you configured in 'Configure your SAML integration' above. Users are initially assigned the 'Member' role, which can log in to the organization and update personal settings. +3. Log out of the ClickHouse organization. +4. Log in with your original authentication method to assign the Admin role to your new SSO account. +- For email + password accounts, please use `https://console.clickhouse.cloud/?with=email`. +- For social logins, please click the appropriate button (**Continue with Google** or **Continue with Microsoft**) :::note `email` in `?with=email` above is the literal parameter value, not a placeholder ::: - - 5. Log out with your original authentication method and log back in via https://console.clickhouse.cloud OR the direct link you configured in 'Configure your SAML integration' above. - - 6. Remove any non-SAML users to enforce SAML for the organization. Going forward users are assigned via your Identity Provider. - +5. Log out with your original authentication method and log back in via https://console.clickhouse.cloud OR the direct link you configured in 'Configure your SAML integration' above. +6. Remove any non-SAML users to enforce SAML for the organization. Going forward users are assigned via your Identity Provider.
### Configure Okta SAML {#configure-okta-saml} @@ -125,134 +89,86 @@ We recommend setting up a **direct link to your organization** in addition to yo You will configure two App Integrations in Okta for each ClickHouse organization: one SAML app and one bookmark to house your direct link.
- 1. Create a group to manage access - - 1. Log in to your Okta instance as an **Administrator**. - - 2. Select **Groups** on the left. - - 3. Click **Add group**. - - 4. Enter a name and description for the group. This group will be used to keep users consistent between the SAML app and its related bookmark app. - - 5. Click **Save**. - - 6. Click the name of the group that you created. - - 7. Click **Assign people** to assign users you would like to have access to this ClickHouse organization. - + 1. Create a group to manage access +1. Log in to your Okta instance as an **Administrator**. +2. Select **Groups** on the left. +3. Click **Add group**. +4. Enter a name and description for the group. This group will be used to keep users consistent between the SAML app and its related bookmark app. +5. Click **Save**. +6. Click the name of the group that you created. +7. Click **Assign people** to assign users you would like to have access to this ClickHouse organization.
- 2. Create a bookmark app to enable users to seamlessly log in - - 1. Select **Applications** on the left, then select the **Applications** subheading. - - 2. Click **Browse App Catalog**. - - 3. Search for and select **Bookmark App**. - - 4. Click **Add integration**. - - 5. Select a label for the app. - - 6. Enter the URL as `https://console.clickhouse.cloud/?connection={organizationid}` - - 7. Go to the **Assignments** tab and add the group you created above. - + 2. Create a bookmark app to enable users to seamlessly log in +1. Select **Applications** on the left, then select the **Applications** subheading. +2. Click **Browse App Catalog**. +3. Search for and select **Bookmark App**. +4. Click **Add integration**. +5. Select a label for the app. +6. Enter the URL as `https://console.clickhouse.cloud/?connection={organizationid}` +7. Go to the **Assignments** tab and add the group you created above.
- 3. Create a SAML app to enable the connection - - 1. Select **Applications** on the left, then select the **Applications** subheading. - - 2. Click **Create App Integration**. - - 3. Select SAML 2.0 and click Next. - - 4. Enter a name for your application and check the box next to **Do not display application icon to users** then click **Next**. - - 5. Use the following values to populate the SAML settings screen. - - | Field | Value | - |--------------------------------|-------| - | Single Sign On URL | `https://auth.clickhouse.cloud/login/callback?connection={organizationid}` | - | Audience URI (SP Entity ID) | `urn:auth0:ch-production:{organizationid}` | - | Default RelayState | Leave blank | - | Name ID format | Unspecified | - | Application username | Email | - | Update application username on | Create and update | - - 7. Enter the following Attribute Statement. - - | Name | Name format | Value | - |---------|---------------|------------| - | email | Basic | user.email | - - 9. Click **Next**. - - 10. Enter the requested information on the Feedback screen and click **Finish**. - - 11. Go to the **Assignments** tab and add the group you created above. - - 12. On the **Sign On** tab for your new app, click the **View SAML setup instructions** button. - - Okta SAML Setup Instructions - - 13. Gather these three items and go to Submit a Support Case above to complete the process. - - Identity Provider Single Sign-On URL - - Identity Provider Issuer - - X.509 Certificate - + 3. Create a SAML app to enable the connection +1. Select **Applications** on the left, then select the **Applications** subheading. +2. Click **Create App Integration**. +3. Select SAML 2.0 and click Next. +4. Enter a name for your application and check the box next to **Do not display application icon to users** then click **Next**. +5. Use the following values to populate the SAML settings screen. +| Field | Value | +|--------------------------------|-------| +| Single Sign On URL | `https://auth.clickhouse.cloud/login/callback?connection={organizationid}` | +| Audience URI (SP Entity ID) | `urn:auth0:ch-production:{organizationid}` | +| Default RelayState | Leave blank | +| Name ID format | Unspecified | +| Application username | Email | +| Update application username on | Create and update | +7. Enter the following Attribute Statement. +| Name | Name format | Value | +|---------|---------------|------------| +| email | Basic | user.email | +9. Click **Next**. +10. Enter the requested information on the Feedback screen and click **Finish**. +11. Go to the **Assignments** tab and add the group you created above. +12. On the **Sign On** tab for your new app, click the **View SAML setup instructions** button. +Okta SAML Setup Instructions +13. Gather these three items and go to Submit a Support Case above to complete the process. +- Identity Provider Single Sign-On URL +- Identity Provider Issuer +- X.509 Certificate
- ### Configure Google SAML {#configure-google-saml} You will configure one SAML app in Google for each organization and must provide your users the direct link (`https://console.clickhouse.cloud/?connection={organizationId}`) to bookmark if using multi-org SSO.
- Create a Google Web App - - 1. Go to your Google Admin console (admin.google.com). - - Google SAML App - - 2. Click **Apps**, then **Web and mobile apps** on the left. - - 3. Click **Add app** from the top menu, then select **Add custom SAML app**. - - 4. Enter a name for the app and click **Continue**. - - 5. Gather these two items and go to Submit a Support Case above to submit the information to us. NOTE: If you complete the setup before copying this data, click **DOWNLOAD METADATA** from the app's home screen to get the X.509 certificate. - - SSO URL - - X.509 Certificate - - 7. Enter the ACS URL and Entity ID below. - - | Field | Value | - |-----------|-------| - | ACS URL | `https://auth.clickhouse.cloud/login/callback?connection={organizationid}` | - | Entity ID | `urn:auth0:ch-production:{organizationid}` | - - 8. Check the box for **Signed response**. - - 9. Select **EMAIL** for the Name ID Format and leave the Name ID as **Basic Information > Primary email.** - - 10. Click **Continue**. - - 11. Enter the following Attribute mapping: - - | Field | Value | - |-------------------|---------------| - | Basic information | Primary email | - | App attributes | email | - - 13. Click **Finish**. - - 14. To enable the app click **OFF** for everyone and change the setting to **ON** for everyone. Access can also be limited to groups or organizational units by selecting options on the left side of the screen. - + Create a Google Web App +1. Go to your Google Admin console (admin.google.com). +Google SAML App +2. Click **Apps**, then **Web and mobile apps** on the left. +3. Click **Add app** from the top menu, then select **Add custom SAML app**. +4. Enter a name for the app and click **Continue**. +5. Gather these two items and go to Submit a Support Case above to submit the information to us. NOTE: If you complete the setup before copying this data, click **DOWNLOAD METADATA** from the app's home screen to get the X.509 certificate. +- SSO URL +- X.509 Certificate +7. Enter the ACS URL and Entity ID below. +| Field | Value | +|-----------|-------| +| ACS URL | `https://auth.clickhouse.cloud/login/callback?connection={organizationid}` | +| Entity ID | `urn:auth0:ch-production:{organizationid}` | +8. Check the box for **Signed response**. +9. Select **EMAIL** for the Name ID Format and leave the Name ID as **Basic Information > Primary email.** +10. Click **Continue**. +11. Enter the following Attribute mapping: +| Field | Value | +|-------------------|---------------| +| Basic information | Primary email | +| App attributes | email | +13. Click **Finish**. +14. To enable the app click **OFF** for everyone and change the setting to **ON** for everyone. Access can also be limited to groups or organizational units by selecting options on the left side of the screen.
### Configure Azure (Microsoft) SAML {#configure-azure-microsoft-saml} @@ -260,82 +176,57 @@ You will configure one SAML app in Google for each organization and must provide Azure (Microsoft) SAML may also be referred to as Azure Active Directory (AD) or Microsoft Entra.
- Create an Azure Enterprise Application - - You will set up one application integration with a separate sign-on URL for each organization. - - 1. Log on to the Microsoft Entra admin center. - - 2. Navigate to **Applications > Enterprise** applications on the left. - - 3. Click **New application** on the top menu. - - 4. Click **Create your own application** on the top menu. - - 5. Enter a name and select **Integrate any other application you don't find in the gallery (Non-gallery)**, then click **Create**. - - Azure Non-Gallery App - - 6. Click **Users and groups** on the left and assign users. - - 7. Click **Single sign-on** on the left. - - 8. Click **SAML**. - - 9. Use the following settings to populate the Basic SAML Configuration screen. - - | Field | Value | - |---------------------------|-------| - | Identifier (Entity ID) | `urn:auth0:ch-production:{organizationid}` | - | Reply URL (Assertion Consumer Service URL) | `https://auth.clickhouse.cloud/login/callback?connection={organizationid}` | - | Sign on URL | `https://console.clickhouse.cloud/?connection={organizationid}` | - | Relay State | Blank | - | Logout URL | Blank | - - 11. Add (A) or update (U) the following under Attributes & Claims: - - | Claim name | Format | Source attribute | - |--------------------------------------|---------------|------------------| - | (U) Unique User Identifier (Name ID) | Email address | user.mail | - | (A) email | Basic | user.mail | - | (U) /identity/claims/name | Omitted | user.mail | - - Attributes and Claims - - 12. Gather these two items and go to Submit a Support Case above to complete the process: - - Login URL - - Certificate (Base64) - + Create an Azure Enterprise Application +You will set up one application integration with a separate sign-on URL for each organization. +1. Log on to the Microsoft Entra admin center. +2. Navigate to **Applications > Enterprise** applications on the left. +3. Click **New application** on the top menu. +4. Click **Create your own application** on the top menu. +5. Enter a name and select **Integrate any other application you don't find in the gallery (Non-gallery)**, then click **Create**. +Azure Non-Gallery App +6. Click **Users and groups** on the left and assign users. +7. Click **Single sign-on** on the left. +8. Click **SAML**. +9. Use the following settings to populate the Basic SAML Configuration screen. +| Field | Value | +|---------------------------|-------| +| Identifier (Entity ID) | `urn:auth0:ch-production:{organizationid}` | +| Reply URL (Assertion Consumer Service URL) | `https://auth.clickhouse.cloud/login/callback?connection={organizationid}` | +| Sign on URL | `https://console.clickhouse.cloud/?connection={organizationid}` | +| Relay State | Blank | +| Logout URL | Blank | +11. Add (A) or update (U) the following under Attributes & Claims: +| Claim name | Format | Source attribute | +|--------------------------------------|---------------|------------------| +| (U) Unique User Identifier (Name ID) | Email address | user.mail | +| (A) email | Basic | user.mail | +| (U) /identity/claims/name | Omitted | user.mail | +Attributes and Claims +12. Gather these two items and go to Submit a Support Case above to complete the process: +- Login URL +- Certificate (Base64)
### Configure Duo SAML {#configure-duo-saml}
- Create a Generic SAML Service Provider for Duo - - 1. Follow the instructions for [Duo Single Sign-On for Generic SAML Service Providers](https://duo.com/docs/sso-generic). - - 2. Use the following Bridge Attribute mapping: - - | Bridge Attribute | ClickHouse Attribute | - |:-------------------|:-----------------------| - | Email Address | email | - - 3. Use the following values to update your Cloud Application in Duo: - - | Field | Value | - |:----------|:-------------------------------------------| - | Entity ID | `urn:auth0:ch-production:{organizationid}` | - | Assertion Consumer Service (ACS) URL | `https://auth.clickhouse.cloud/login/callback?connection={organizationid}` | - | Service Provider Login URL | `https://console.clickhouse.cloud/?connection={organizationid}` | - - 4. Gather these two items and go to Submit a Support Case above to complete the process: - - Single Sign-On URL - - Certificate - + Create a Generic SAML Service Provider for Duo +1. Follow the instructions for [Duo Single Sign-On for Generic SAML Service Providers](https://duo.com/docs/sso-generic). +2. Use the following Bridge Attribute mapping: +| Bridge Attribute | ClickHouse Attribute | +|:-------------------|:-----------------------| +| Email Address | email | +3. Use the following values to update your Cloud Application in Duo: +| Field | Value | +|:----------|:-------------------------------------------| +| Entity ID | `urn:auth0:ch-production:{organizationid}` | +| Assertion Consumer Service (ACS) URL | `https://auth.clickhouse.cloud/login/callback?connection={organizationid}` | +| Service Provider Login URL | `https://console.clickhouse.cloud/?connection={organizationid}` | +4. Gather these two items and go to Submit a Support Case above to complete the process: +- Single Sign-On URL +- Certificate
- ## How it works {#how-it-works} ### Service provider-initiated SSO {#service-provider-initiated-sso} @@ -369,4 +260,3 @@ Security is our top priority when it comes to authentication. For this reason, w - **All users assigned to your app via your IdP must have the same email domain.** If you have vendors, contractors or consultants you would like to have access to your ClickHouse account, they must have an email address with the same domain (e.g. user@domain.com) as your employees. - **We do not automatically link SSO and non-SSO accounts.** You may see multiple accounts for your users in your ClickHouse user list even if they are using the same email address. - diff --git a/docs/cloud/security/setting-ip-filters.md b/docs/cloud/security/setting-ip-filters.md index 3d8586ef11c..3d953b84b78 100644 --- a/docs/cloud/security/setting-ip-filters.md +++ b/docs/cloud/security/setting-ip-filters.md @@ -25,57 +25,48 @@ Classless Inter-domain Routing (CIDR) notation, allows you to specify IP address ## Create or modify an IP access list {#create-or-modify-an-ip-access-list}
- IP access list for ClickHouse services - - When you create a ClickHouse service, the default setting for the IP allow list is 'Allow from nowhere.' - - From your ClickHouse Cloud services list select the service and then select **Settings**. Under the **Security** section, you will find the IP access list. Click on the Add IPs button. - - A sidebar will appear with options for you to configure: - - - Allow incoming traffic from anywhere to the service - - Allow access from specific locations to the service - - Deny all access to the service - +IP access list for ClickHouse services +When you create a ClickHouse service, the default setting for the IP allow list is 'Allow from nowhere.' +From your ClickHouse Cloud services list select the service and then select **Settings**. Under the **Security** section, you will find the IP access list. Click on the Add IPs button. +A sidebar will appear with options for you to configure: +- Allow incoming traffic from anywhere to the service +- Allow access from specific locations to the service +- Deny all access to the service
- IP access list for API keys - - When you create an API key, the default setting for the IP allow list is 'Allow from anywhere.' - - From the API key list, click the three dots next to the API key under the **Actions** column and select **Edit**. At the bottom of the screen you will find the IP access list and options to configure: - - - Allow incoming traffic from anywhere to the service - - Allow access from specific locations to the service - - Deny all access to the service - +IP access list for API keys +When you create an API key, the default setting for the IP allow list is 'Allow from anywhere.' +From the API key list, click the three dots next to the API key under the **Actions** column and select **Edit**. At the bottom of the screen you will find the IP access list and options to configure: +- Allow incoming traffic from anywhere to the service +- Allow access from specific locations to the service +- Deny all access to the service
This screenshot shows an access list which allows traffic from a range of IP addresses, described as "NY Office range": - + Existing access list in ClickHouse Cloud ### Possible actions {#possible-actions} 1. To add an additional entry you can use **+ Add new IP** - This example adds a single IP address, with a description of `London server`: + This example adds a single IP address, with a description of `London server`: -Adding a single IP to the access list in ClickHouse Cloud + Adding a single IP to the access list in ClickHouse Cloud 1. Delete an existing entry - Clicking the cross (x) can deletes an entry + Clicking the cross (x) can deletes an entry 1. Edit an existing entry - Directly modifying the entry + Directly modifying the entry 1. Switch to allow access from **Anywhere** - This is not recommended, but it is allowed. We recommend that you expose an application built on top of ClickHouse to the public and restrict access to the back-end ClickHouse Cloud service. + This is not recommended, but it is allowed. We recommend that you expose an application built on top of ClickHouse to the public and restrict access to the back-end ClickHouse Cloud service. -To apply the changes you made, you must click **Save**. + To apply the changes you made, you must click **Save**. ## Verification {#verification} diff --git a/docs/cloud/security/shared-responsibility-model.md b/docs/cloud/security/shared-responsibility-model.md index 6249c99bbb4..44f29d83702 100644 --- a/docs/cloud/security/shared-responsibility-model.md +++ b/docs/cloud/security/shared-responsibility-model.md @@ -17,7 +17,6 @@ The Cloud architecture consists of the control plane and the data plane. The con Bring your own cloud (BYOC) enables customers to run the data plane in their own cloud account. For more information, review our [(BYOC) Bring Your Own Cloud](/cloud/reference/byoc) page. - ## ClickHouse Cloud shared responsibility model {#clickhouse-cloud-shared-responsibility-model} The model below generally addresses ClickHouse responsibilities and shows responsibilities that should be addressed by customers of ClickHouse Cloud and ClickHouse BYOC, respectively. For more information on our PCI shared responsibility model, please download a copy of the overview available in our [Trust Center](https://trust.clickhouse.com). @@ -43,63 +42,51 @@ The model below generally addresses ClickHouse responsibilities and shows respon ## ClickHouse Cloud configurable security features {#clickhouse-cloud-configurable-security-features}
- Network connectivity - - | Setting | Status | Cloud | Service level | - |------------------------------------------------------------------------------------------------------|-----------|-------------------|----------------------| - | [IP filters](/cloud/security/setting-ip-filters) to restrict connections to services | Available | AWS, GCP, Azure | All | - | [Private link](/cloud/security/private-link-overview) to securely connect to services | Available | AWS, GCP, Azure | Scale or Enterprise | - +Network connectivity +| Setting | Status | Cloud | Service level | +|------------------------------------------------------------------------------------------------------|-----------|-------------------|----------------------| +| [IP filters](/cloud/security/setting-ip-filters) to restrict connections to services | Available | AWS, GCP, Azure | All | +| [Private link](/cloud/security/private-link-overview) to securely connect to services | Available | AWS, GCP, Azure | Scale or Enterprise |
- Access management - - - | Setting | Status | Cloud | Service level | - |------------------------------------------------------------------------------------------------------|-----------|-------------------|-------------------------| - | [Standard role-based access](/cloud/security/cloud-access-management) in control plane | Available | AWS, GCP, Azure | All | - | [Multi-factor authentication (MFA)](/cloud/security/cloud-authentication#multi-factor-authentication) available | Available | AWS, GCP, Azure | All | - | [SAML Single Sign-On](/cloud/security/saml-setup) to control plane available | Preview | AWS, GCP, Azure | Enterprise | - | Granular [role-based access control](/cloud/security/cloud-access-management/overview#database-permissions) in databases | Available | AWS, GCP, Azure | All | - +Access management +| Setting | Status | Cloud | Service level | +|------------------------------------------------------------------------------------------------------|-----------|-------------------|-------------------------| +| [Standard role-based access](/cloud/security/cloud-access-management) in control plane | Available | AWS, GCP, Azure | All | +| [Multi-factor authentication (MFA)](/cloud/security/cloud-authentication#multi-factor-authentication) available | Available | AWS, GCP, Azure | All | +| [SAML Single Sign-On](/cloud/security/saml-setup) to control plane available | Preview | AWS, GCP, Azure | Enterprise | +| Granular [role-based access control](/cloud/security/cloud-access-management/overview#database-permissions) in databases | Available | AWS, GCP, Azure | All |
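To illustrate the granular database-level role-based access control listed above, a minimal sketch of the kind of statements an admin might run is shown below; the role, database, and user names are hypothetical:

```sql
-- Hypothetical names, shown only to illustrate database-level RBAC
CREATE ROLE IF NOT EXISTS analytics_read;
GRANT SELECT ON analytics.* TO analytics_read;

CREATE USER IF NOT EXISTS report_user IDENTIFIED WITH sha256_password BY 'change-me';
GRANT analytics_read TO report_user;
```

Grants like these can be adjusted later with `GRANT` and `REVOKE` as requirements change.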
- Data security - - | Setting | Status | Cloud | Service level | - |------------------------------------------------------------------------------------------------------|-----------|-------------------|-------------------------| - | [Cloud provider and region](/cloud/reference/supported-regions) selections | Available | AWS, GCP, Azure | All | - | Limited [free daily backups](/cloud/manage/backups/overview#default-backup-policy) | Available | AWS, GCP, Azure | All | - | [Custom backup configurations](/cloud/manage/backups/overview#configurable-backups) available | Available | GCP, AWS, Azure | Scale or Enterprise | - | [Customer managed encryption keys (CMEK)](/cloud/security/cmek) for transparent
data encryption available | Available | AWS, GCP | Enterprise | - | [Field level encryption](/sql-reference/functions/encryption-functions) with manual key management for granular encryption | Available | GCP, AWS, Azure | All | - - +Data security +| Setting | Status | Cloud | Service level | +|------------------------------------------------------------------------------------------------------|-----------|-------------------|-------------------------| +| [Cloud provider and region](/cloud/reference/supported-regions) selections | Available | AWS, GCP, Azure | All | +| Limited [free daily backups](/cloud/manage/backups/overview#default-backup-policy) | Available | AWS, GCP, Azure | All | +| [Custom backup configurations](/cloud/manage/backups/overview#configurable-backups) available | Available | GCP, AWS, Azure | Scale or Enterprise | +| [Customer managed encryption keys (CMEK)](/cloud/security/cmek) for transparent
data encryption available | Available | AWS, GCP | Enterprise | +| [Field level encryption](/sql-reference/functions/encryption-functions) with manual key management for granular encryption | Available | GCP, AWS, Azure | All |
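As a hedged illustration of the field-level encryption functions referenced above, the following round-trips a value with a key you manage yourself; the 32-byte key shown is a throwaway example, not a key-management recommendation:

```sql
-- Example key only; store and rotate real keys outside the database
WITH 'keykeykeykeykeykeykeykeykeykey12' AS key
SELECT
    hex(encrypt('aes-256-ecb', 'sensitive value', key)) AS ciphertext,
    decrypt('aes-256-ecb', encrypt('aes-256-ecb', 'sensitive value', key), key) AS round_trip;
```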
- Data retention - - | Setting | Status | Cloud | Service level | - |------------------------------------------------------------------------------------------------------|-----------|-------------------|-------------------------| - | [Time to live (TTL)](/sql-reference/statements/alter/ttl) settings to manage retention | Available | AWS, GCP, Azure | All | - | [ALTER TABLE DELETE](/sql-reference/statements/alter/delete) for heavy deletion actions | Available | AWS, GCP, Azure | All | - | [Lightweight DELETE](/sql-reference/statements/delete) for measured deletion activities | Available | AWS, GCP, Azure | All | - +Data retention +| Setting | Status | Cloud | Service level | +|------------------------------------------------------------------------------------------------------|-----------|-------------------|-------------------------| +| [Time to live (TTL)](/sql-reference/statements/alter/ttl) settings to manage retention | Available | AWS, GCP, Azure | All | +| [ALTER TABLE DELETE](/sql-reference/statements/alter/delete) for heavy deletion actions | Available | AWS, GCP, Azure | All | +| [Lightweight DELETE](/sql-reference/statements/delete) for measured deletion activities | Available | AWS, GCP, Azure | All |
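To make the retention controls above concrete, here is a short sketch against a hypothetical `events` MergeTree table; the table and column names are assumptions:

```sql
-- Hypothetical table and columns, for illustration only
ALTER TABLE events MODIFY TTL created_at + INTERVAL 90 DAY;  -- expire rows by age

ALTER TABLE events DELETE WHERE user_id = 42;                -- heavyweight delete (mutation)

DELETE FROM events WHERE user_id = 42;                       -- lightweight delete
```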
- Auditing and logging - - | Setting | Status | Cloud | Service level | - |------------------------------------------------------------------------------------------------------|-----------|-------------------|-------------------------| - | [Audit log](/cloud/security/audit-logging) for control plane activities | Available | AWS, GCP, Azure | All | - | [Session log](/operations/system-tables/session_log) for database activities | Available | AWS, GCP, Azure | All | - | [Query log](/operations/system-tables/query_log) for database activities | Available | AWS, GCP, Azure | All | - +Auditing and logging +| Setting | Status | Cloud | Service level | +|------------------------------------------------------------------------------------------------------|-----------|-------------------|-------------------------| +| [Audit log](/cloud/security/audit-logging) for control plane activities | Available | AWS, GCP, Azure | All | +| [Session log](/operations/system-tables/session_log) for database activities | Available | AWS, GCP, Azure | All | +| [Query log](/operations/system-tables/query_log) for database activities | Available | AWS, GCP, Azure | All |
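For the database-side logs above, a simple illustrative query against the query log system table might look like this:

```sql
-- Most recently completed queries, for illustration
SELECT event_time, user, query_duration_ms, query
FROM system.query_log
WHERE type = 'QueryFinish'
ORDER BY event_time DESC
LIMIT 10;
```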
## ClickHouse Cloud compliance {#clickhouse-cloud-compliance} - | Framework | Status | Cloud | Service level | + | Framework | Status | Cloud | Service level | |------------------------------------------------------------------------------------------------------|-----------|-------------------|-------------------------| | ISO 27001 compliance | Available | AWS, GCP, Azure | All | | SOC 2 Type II compliance | Available | AWS, GCP, Azure | All | @@ -108,4 +95,3 @@ The model below generally addresses ClickHouse responsibilities and shows respon | PCI compliance | Available | AWS | Enterprise | For more information on supported compliance frameworks, please review our [Security and Compliance](/cloud/security/security-and-compliance) page. - diff --git a/docs/concepts/glossary.md b/docs/concepts/glossary.md index 4b36ca9f57f..b0cab28be5a 100644 --- a/docs/concepts/glossary.md +++ b/docs/concepts/glossary.md @@ -9,7 +9,7 @@ slug: /concepts/glossary ## Atomicity {#atomicity} -Atomicity ensures that a transaction (a series of database operations) is treated as a single, indivisible unit. This means that either all operations within the transaction occur, or none do. An example of an atomic transaction is transferring money from one bank account to another. If either step of the transfer fails, the transaction fails, and the money stays in the first account. Atomicity ensures no money is lost or created. +Atomicity ensures that a transaction (a series of database operations) is treated as a single, indivisible unit. This means that either all operations within the transaction occur, or none do. An example of an atomic transaction is transferring money from one bank account to another. If either step of the transfer fails, the transaction fails, and the money stays in the first account. Atomicity ensures no money is lost or created. ## Cluster {#cluster} @@ -17,7 +17,7 @@ A collection of nodes (servers) that work together to store and process data. ## CMEK {#cmek} -Customer-managed encryption keys (CMEK) allow customers to use their key-management service (KMS) key to encrypt the ClickHouse disk data key and protect their data at rest. +Customer-managed encryption keys (CMEK) allow customers to use their key-management service (KMS) key to encrypt the ClickHouse disk data key and protect their data at rest. ## Dictionary {#dictionary} @@ -33,4 +33,4 @@ A copy of the data stored in a ClickHouse database. You can have any number of r ## Shard {#shard} -A subset of data. ClickHouse always has at least one shard for your data. If you do not split the data across multiple servers, your data will be stored in one shard. Sharding data across multiple servers can be used to divide the load if you exceed the capacity of a single server. +A subset of data. ClickHouse always has at least one shard for your data. If you do not split the data across multiple servers, your data will be stored in one shard. Sharding data across multiple servers can be used to divide the load if you exceed the capacity of a single server. diff --git a/docs/concepts/index.md b/docs/concepts/index.md index 10a67c51f50..3fb750c86d8 100644 --- a/docs/concepts/index.md +++ b/docs/concepts/index.md @@ -7,12 +7,12 @@ pagination_prev: null keywords: ['concepts', 'OLAP', 'fast'] --- -In this section of the docs we'll dive into the concepts around what makes ClickHouse so fast and efficient. +In this section of the docs we'll dive into the concepts around what makes ClickHouse so fast and efficient. 
| Page | Description | |------------------------------------------------------------------|---------------------------------------------------------------------------------------| -| [Why is ClickHouse so Fast?](./why-clickhouse-is-so-fast.md) | Learn what makes ClickHouse so fast. -| [What is OLAP?](./olap.md) | Learn what Online Analytical Processing is. -| [Why is ClickHouse unique?](../about-us/distinctive-features.md) | Learn what makes ClickHouse unique. -| [Glossary](./glossary.md) | This page contains a glossary of terms you'll commonly encounter throughout the docs. -| [FAQ](../faq/index.md) | A compilation of the most frequently asked questions we get about ClickHouse. +| [Why is ClickHouse so Fast?](./why-clickhouse-is-so-fast.md) | Learn what makes ClickHouse so fast. +| [What is OLAP?](./olap.md) | Learn what Online Analytical Processing is. +| [Why is ClickHouse unique?](../about-us/distinctive-features.md) | Learn what makes ClickHouse unique. +| [Glossary](./glossary.md) | This page contains a glossary of terms you'll commonly encounter throughout the docs. +| [FAQ](../faq/index.md) | A compilation of the most frequently asked questions we get about ClickHouse. diff --git a/docs/concepts/olap.md b/docs/concepts/olap.md index 253bd85310b..3a2a43b81ca 100644 --- a/docs/concepts/olap.md +++ b/docs/concepts/olap.md @@ -11,14 +11,12 @@ keywords: ['OLAP'] [OLAP](https://en.wikipedia.org/wiki/Online_analytical_processing) stands for Online Analytical Processing. It is a broad term that can be looked at from two perspectives: technical and business. At the highest level, you can just read these words backward: - **Processing** some source data is processed… **Analytical** …to produce some analytical reports and insights… **Online** …in real-time. - ## OLAP from the business perspective {#olap-from-the-business-perspective} In recent years business people started to realize the value of data. Companies who make their decisions blindly more often than not fail to keep up with the competition. The data-driven approach of successful companies forces them to collect all data that might be even remotely useful for making business decisions, and imposes on them a need for mechanisms which allow them to analyze this data in a timely manner. Here's where OLAP database management systems (DBMS) come in. diff --git a/docs/concepts/why-clickhouse-is-so-fast.md b/docs/concepts/why-clickhouse-is-so-fast.md index d3fb45a5bfe..04db3016420 100644 --- a/docs/concepts/why-clickhouse-is-so-fast.md +++ b/docs/concepts/why-clickhouse-is-so-fast.md @@ -49,13 +49,13 @@ Unlike other databases, ClickHouse keeps data writes lightweight and efficient b - **TTL (time-to-live) merges** compress, move, or delete rows based on certain time-based rules. -The point of these transformations is to shift work (computation) from the time user queries run to merge time. This is important for two reasons: + The point of these transformations is to shift work (computation) from the time user queries run to merge time. This is important for two reasons: -On the one hand, user queries may become significantly faster, sometimes by 1000x or more, if they can leverage "transformed" data, e.g. pre-aggregated data. + On the one hand, user queries may become significantly faster, sometimes by 1000x or more, if they can leverage "transformed" data, e.g. pre-aggregated data. -On the other hand, the majority of the runtime of merges is consumed by loading the input parts and saving the output part. 
The additional effort to transform the data during merge does usually not impact the runtime of merges too much. All of this magic is completely transparent and does not affect the result of queries (besides their performance). + On the other hand, the majority of the runtime of merges is consumed by loading the input parts and saving the output part. The additional effort to transform the data during merge does usually not impact the runtime of merges too much. All of this magic is completely transparent and does not affect the result of queries (besides their performance). -🤿 Deep dive into this in the [Merge-time Data Transformation](/docs/academic_overview#3-3-merge-time-data-transformation) section of the web version of our VLDB 2024 paper. + 🤿 Deep dive into this in the [Merge-time Data Transformation](/docs/academic_overview#3-3-merge-time-data-transformation) section of the web version of our VLDB 2024 paper. ## Storage layer: data pruning {#storage-layer-data-pruning} @@ -69,9 +69,9 @@ In practice, many queries are repetitive, i.e., run unchanged or only with sligh 3. [Skipping indexes](/optimize/skipping-indexes) that embed additional data statistics into columns, e.g. the minimum and maximum column value, the set of unique values, etc. Skipping indexes are orthogonal to primary keys and table projections, and depending on the data distribution in the column, they can greatly speed up the evaluation of filters. -All three techniques aim to skip as many rows during full-column reads as possible because the fastest way to read data is to not read it at all. + All three techniques aim to skip as many rows during full-column reads as possible because the fastest way to read data is to not read it at all. -🤿 Deep dive into this in the [Data Pruning](/docs/academic_overview#3-2-data-pruning) section of the web version of our VLDB 2024 paper. + 🤿 Deep dive into this in the [Data Pruning](/docs/academic_overview#3-2-data-pruning) section of the web version of our VLDB 2024 paper. ## Storage layer: data compression {#storage-layer-data-compression} @@ -109,7 +109,6 @@ If a single node becomes too small to hold the table data, further nodes can be What sets ClickHouse [apart](https://www.youtube.com/watch?v=CAS2otEoerM) is its meticulous attention to low-level optimization. Building a database that simply works is one thing, but engineering it to deliver speed across diverse query types, data structures, distributions, and index configurations is where the "[freak system](https://youtu.be/Vy2t_wZx4Is?si=K7MyzsBBxgmGcuGU&t=3579)" artistry shines. - **Hash Tables.** Let's take a hash table as an example. Hash tables are central data structures used by joins and aggregations. As a programmer, one needs to consider these design decisions: * The hash function to choose, @@ -118,20 +117,20 @@ What sets ClickHouse [apart](https://www.youtube.com/watch?v=CAS2otEoerM) is its * The fill factor: When and how to resize? How to move values during resize? * Deletions: Should the hash table allow evicting entries? -A standard hash table provided by a third-party library would functionally work, but it would not be fast. Great performance requires meticulous benchmarking and experimentation. + A standard hash table provided by a third-party library would functionally work, but it would not be fast. Great performance requires meticulous benchmarking and experimentation. 
-The [hash table implementation in ClickHouse](https://clickhouse.com/blog/hash-tables-in-clickhouse-and-zero-cost-abstractions) chooses one of **30+ precompiled hash table variants based** on the specifics of the query and the data. + The [hash table implementation in ClickHouse](https://clickhouse.com/blog/hash-tables-in-clickhouse-and-zero-cost-abstractions) chooses one of **30+ precompiled hash table variants based** on the specifics of the query and the data. -**Algorithms.** The same goes for algorithms. For example, in sorting, you might consider: + **Algorithms.** The same goes for algorithms. For example, in sorting, you might consider: * What will be sorted: numbers, tuples, strings, or structures? * Is the data in RAM? * Is the sort required to be stable? * Should all data be sorted or will a partial sort suffice? -Algorithms that rely on data characteristics often perform better than their generic counterparts. If the data characteristics are not known in advance, the system can try various implementations and choose the one that works best at runtime. For an example, see the [article on how LZ4 decompression is implemented in ClickHouse](https://habr.com/en/company/yandex/blog/457612/). + Algorithms that rely on data characteristics often perform better than their generic counterparts. If the data characteristics are not known in advance, the system can try various implementations and choose the one that works best at runtime. For an example, see the [article on how LZ4 decompression is implemented in ClickHouse](https://habr.com/en/company/yandex/blog/457612/). -🤿 Deep dive into this in the [Holistic Performance Optimization](/academic_overview#4-4-holistic-performance-optimization) section of the web version of our VLDB 2024 paper. + 🤿 Deep dive into this in the [Holistic Performance Optimization](/academic_overview#4-4-holistic-performance-optimization) section of the web version of our VLDB 2024 paper. ## VLDB 2024 paper {#vldb-2024-paper} diff --git a/docs/data-compression/compression-in-clickhouse.md b/docs/data-compression/compression-in-clickhouse.md index accecf43e45..348fa662e3a 100644 --- a/docs/data-compression/compression-in-clickhouse.md +++ b/docs/data-compression/compression-in-clickhouse.md @@ -5,7 +5,7 @@ description: 'Choosing ClickHouse compression algorithms' keywords: ['compression', 'codec', 'encoding'] --- -One of the secrets to ClickHouse query performance is compression. +One of the secrets to ClickHouse query performance is compression. Less data on disk means less I/O and faster queries and inserts. The overhead of any compression algorithm with respect to CPU is in most cases outweighed by the reduction in IO. Improving the compression of the data should therefore be the first focus when working on ensuring ClickHouse queries are fast. @@ -16,7 +16,7 @@ Compression in ClickHouse will be impacted by 3 principal factors: - The data types - Which codecs are used -All of these are configured through the schema. + All of these are configured through the schema. ## Choose the right data type to optimize compression {#choose-the-right-data-type-to-optimize-compression} @@ -25,113 +25,113 @@ Let's use the Stack Overflow dataset as an example. Let's compare compression st - `posts` - A non type optimized schema with no ordering key. - `posts_v3` - A type optimized schema with the appropriate type and bit size for each column with ordering key `(PostTypeId, toDate(CreationDate), CommentCount)`. 
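To make "type optimized" concrete, a minimal sketch of a table in the spirit of `posts_v3` is shown below. The column list and exact types here are illustrative assumptions rather than the guide's full schema; the point is simply to pick the narrowest type that safely holds each column and to order by the chosen key.

```sql
-- Illustrative sketch only, not the full posts_v3 schema from this guide
CREATE TABLE posts_v3_sketch
(
    `Id` Int32,
    `PostTypeId` UInt8,                       -- only a handful of distinct post types
    `CreationDate` DateTime,
    `Score` Int32,
    `ViewCount` UInt32,                       -- never negative, fits in 32 bits
    `CommentCount` UInt8,                     -- assumption: small counts, so 8 bits suffice
    `ContentLicense` LowCardinality(String),  -- few distinct values
    `Title` String
)
ENGINE = MergeTree
ORDER BY (PostTypeId, toDate(CreationDate), CommentCount)
```

Narrower integer types and `LowCardinality(String)` reduce the uncompressed size directly, while the ordering key groups similar values together so the codecs can compress them more effectively.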
-Using the following queries, we can measure the current compressed and uncompressed size of each column. Let's examine the size of the initial optimized schema `posts` with no ordering key. + Using the following queries, we can measure the current compressed and uncompressed size of each column. Let's examine the size of the initial optimized schema `posts` with no ordering key. -```sql -SELECT name, - formatReadableSize(sum(data_compressed_bytes)) AS compressed_size, - formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed_size, - round(sum(data_uncompressed_bytes) / sum(data_compressed_bytes), 2) AS ratio -FROM system.columns -WHERE table = 'posts' -GROUP BY name - -┌─name──────────────────┬─compressed_size─┬─uncompressed_size─┬───ratio────┐ -│ Body │ 46.14 GiB │ 127.31 GiB │ 2.76 │ -│ Title │ 1.20 GiB │ 2.63 GiB │ 2.19 │ -│ Score │ 84.77 MiB │ 736.45 MiB │ 8.69 │ -│ Tags │ 475.56 MiB │ 1.40 GiB │ 3.02 │ -│ ParentId │ 210.91 MiB │ 696.20 MiB │ 3.3 │ -│ Id │ 111.17 MiB │ 736.45 MiB │ 6.62 │ -│ AcceptedAnswerId │ 81.55 MiB │ 736.45 MiB │ 9.03 │ -│ ClosedDate │ 13.99 MiB │ 517.82 MiB │ 37.02 │ -│ LastActivityDate │ 489.84 MiB │ 964.64 MiB │ 1.97 │ -│ CommentCount │ 37.62 MiB │ 565.30 MiB │ 15.03 │ -│ OwnerUserId │ 368.98 MiB │ 736.45 MiB │ 2 │ -│ AnswerCount │ 21.82 MiB │ 622.35 MiB │ 28.53 │ -│ FavoriteCount │ 280.95 KiB │ 508.40 MiB │ 1853.02 │ -│ ViewCount │ 95.77 MiB │ 736.45 MiB │ 7.69 │ -│ LastEditorUserId │ 179.47 MiB │ 736.45 MiB │ 4.1 │ -│ ContentLicense │ 5.45 MiB │ 847.92 MiB │ 155.5 │ -│ OwnerDisplayName │ 14.30 MiB │ 142.58 MiB │ 9.97 │ -│ PostTypeId │ 20.93 MiB │ 565.30 MiB │ 27 │ -│ CreationDate │ 314.17 MiB │ 964.64 MiB │ 3.07 │ -│ LastEditDate │ 346.32 MiB │ 964.64 MiB │ 2.79 │ -│ LastEditorDisplayName │ 5.46 MiB │ 124.25 MiB │ 22.75 │ -│ CommunityOwnedDate │ 2.21 MiB │ 509.60 MiB │ 230.94 │ -└───────────────────────┴─────────────────┴───────────────────┴────────────┘ -``` - -We show both a compressed and uncompressed size here. Both are important. The compressed size equates to what we will need to read off disk - something we want to minimize for query performance (and storage cost). This data will need to be decompressed prior to reading. The size of this uncompressed size will be dependent on the data type used in this case. Minimizing this size will reduce memory overhead of queries and the amount of data which has to be processed by the query, improving utilization of caches and ultimately query times. - -> The above query relies on the table `columns` in the system database. This database is managed by ClickHouse and is a treasure trove of useful information, from query performance metrics to background cluster logs. We recommend ["System Tables and a Window into the Internals of ClickHouse"](https://clickhouse.com/blog/clickhouse-debugging-issues-with-system-tables) and accompanying articles[[1]](https://clickhouse.com/blog/monitoring-troubleshooting-insert-queries-clickhouse)[[2]](https://clickhouse.com/blog/monitoring-troubleshooting-select-queries-clickhouse) for the curious reader. 
- -To summarize the total size of the table, we can simplify the above query: - -```sql -SELECT formatReadableSize(sum(data_compressed_bytes)) AS compressed_size, + ```sql + SELECT name, + formatReadableSize(sum(data_compressed_bytes)) AS compressed_size, formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed_size, round(sum(data_uncompressed_bytes) / sum(data_compressed_bytes), 2) AS ratio -FROM system.columns -WHERE table = 'posts' + FROM system.columns + WHERE table = 'posts' + GROUP BY name + + ┌─name──────────────────┬─compressed_size─┬─uncompressed_size─┬───ratio────┐ + │ Body │ 46.14 GiB │ 127.31 GiB │ 2.76 │ + │ Title │ 1.20 GiB │ 2.63 GiB │ 2.19 │ + │ Score │ 84.77 MiB │ 736.45 MiB │ 8.69 │ + │ Tags │ 475.56 MiB │ 1.40 GiB │ 3.02 │ + │ ParentId │ 210.91 MiB │ 696.20 MiB │ 3.3 │ + │ Id │ 111.17 MiB │ 736.45 MiB │ 6.62 │ + │ AcceptedAnswerId │ 81.55 MiB │ 736.45 MiB │ 9.03 │ + │ ClosedDate │ 13.99 MiB │ 517.82 MiB │ 37.02 │ + │ LastActivityDate │ 489.84 MiB │ 964.64 MiB │ 1.97 │ + │ CommentCount │ 37.62 MiB │ 565.30 MiB │ 15.03 │ + │ OwnerUserId │ 368.98 MiB │ 736.45 MiB │ 2 │ + │ AnswerCount │ 21.82 MiB │ 622.35 MiB │ 28.53 │ + │ FavoriteCount │ 280.95 KiB │ 508.40 MiB │ 1853.02 │ + │ ViewCount │ 95.77 MiB │ 736.45 MiB │ 7.69 │ + │ LastEditorUserId │ 179.47 MiB │ 736.45 MiB │ 4.1 │ + │ ContentLicense │ 5.45 MiB │ 847.92 MiB │ 155.5 │ + │ OwnerDisplayName │ 14.30 MiB │ 142.58 MiB │ 9.97 │ + │ PostTypeId │ 20.93 MiB │ 565.30 MiB │ 27 │ + │ CreationDate │ 314.17 MiB │ 964.64 MiB │ 3.07 │ + │ LastEditDate │ 346.32 MiB │ 964.64 MiB │ 2.79 │ + │ LastEditorDisplayName │ 5.46 MiB │ 124.25 MiB │ 22.75 │ + │ CommunityOwnedDate │ 2.21 MiB │ 509.60 MiB │ 230.94 │ + └───────────────────────┴─────────────────┴───────────────────┴────────────┘ + ``` + + We show both a compressed and uncompressed size here. Both are important. The compressed size equates to what we will need to read off disk - something we want to minimize for query performance (and storage cost). This data will need to be decompressed prior to reading. The size of this uncompressed size will be dependent on the data type used in this case. Minimizing this size will reduce memory overhead of queries and the amount of data which has to be processed by the query, improving utilization of caches and ultimately query times. + + > The above query relies on the table `columns` in the system database. This database is managed by ClickHouse and is a treasure trove of useful information, from query performance metrics to background cluster logs. We recommend ["System Tables and a Window into the Internals of ClickHouse"](https://clickhouse.com/blog/clickhouse-debugging-issues-with-system-tables) and accompanying articles[[1]](https://clickhouse.com/blog/monitoring-troubleshooting-insert-queries-clickhouse)[[2]](https://clickhouse.com/blog/monitoring-troubleshooting-select-queries-clickhouse) for the curious reader. 
+ + To summarize the total size of the table, we can simplify the above query: + + ```sql + SELECT formatReadableSize(sum(data_compressed_bytes)) AS compressed_size, + formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed_size, + round(sum(data_uncompressed_bytes) / sum(data_compressed_bytes), 2) AS ratio + FROM system.columns + WHERE table = 'posts' -┌─compressed_size─┬─uncompressed_size─┬─ratio─┐ -│ 50.16 GiB │ 143.47 GiB │ 2.86 │ -└─────────────────┴───────────────────┴───────┘ -``` + ┌─compressed_size─┬─uncompressed_size─┬─ratio─┐ + │ 50.16 GiB │ 143.47 GiB │ 2.86 │ + └─────────────────┴───────────────────┴───────┘ + ``` -Repeating this query for the `posts_v3`, the table with an optimized type and ordering key, we can see a significant reduction in uncompressed and compressed sizes. + Repeating this query for the `posts_v3`, the table with an optimized type and ordering key, we can see a significant reduction in uncompressed and compressed sizes. -```sql -SELECT + ```sql + SELECT formatReadableSize(sum(data_compressed_bytes)) AS compressed_size, formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed_size, round(sum(data_uncompressed_bytes) / sum(data_compressed_bytes), 2) AS ratio -FROM system.columns -WHERE `table` = 'posts_v3' + FROM system.columns + WHERE `table` = 'posts_v3' -┌─compressed_size─┬─uncompressed_size─┬─ratio─┐ -│ 25.15 GiB │ 68.87 GiB │ 2.74 │ -└─────────────────┴───────────────────┴───────┘ -``` + ┌─compressed_size─┬─uncompressed_size─┬─ratio─┐ + │ 25.15 GiB │ 68.87 GiB │ 2.74 │ + └─────────────────┴───────────────────┴───────┘ + ``` -The full column breakdown shows considerable savings for the `Body`, `Title`, `Tags` and `CreationDate` columns achieved by ordering the data prior to compression and using the appropriate types. + The full column breakdown shows considerable savings for the `Body`, `Title`, `Tags` and `CreationDate` columns achieved by ordering the data prior to compression and using the appropriate types. 
-```sql -SELECT + ```sql + SELECT name, formatReadableSize(sum(data_compressed_bytes)) AS compressed_size, formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed_size, round(sum(data_uncompressed_bytes) / sum(data_compressed_bytes), 2) AS ratio -FROM system.columns -WHERE `table` = 'posts_v3' -GROUP BY name - -┌─name──────────────────┬─compressed_size─┬─uncompressed_size─┬───ratio─┐ -│ Body │ 23.10 GiB │ 63.63 GiB │ 2.75 │ -│ Title │ 614.65 MiB │ 1.28 GiB │ 2.14 │ -│ Score │ 40.28 MiB │ 227.38 MiB │ 5.65 │ -│ Tags │ 234.05 MiB │ 688.49 MiB │ 2.94 │ -│ ParentId │ 107.78 MiB │ 321.33 MiB │ 2.98 │ -│ Id │ 159.70 MiB │ 227.38 MiB │ 1.42 │ -│ AcceptedAnswerId │ 40.34 MiB │ 227.38 MiB │ 5.64 │ -│ ClosedDate │ 5.93 MiB │ 9.49 MiB │ 1.6 │ -│ LastActivityDate │ 246.55 MiB │ 454.76 MiB │ 1.84 │ -│ CommentCount │ 635.78 KiB │ 56.84 MiB │ 91.55 │ -│ OwnerUserId │ 183.86 MiB │ 227.38 MiB │ 1.24 │ -│ AnswerCount │ 9.67 MiB │ 113.69 MiB │ 11.76 │ -│ FavoriteCount │ 19.77 KiB │ 147.32 KiB │ 7.45 │ -│ ViewCount │ 45.04 MiB │ 227.38 MiB │ 5.05 │ -│ LastEditorUserId │ 86.25 MiB │ 227.38 MiB │ 2.64 │ -│ ContentLicense │ 2.17 MiB │ 57.10 MiB │ 26.37 │ -│ OwnerDisplayName │ 5.95 MiB │ 16.19 MiB │ 2.72 │ -│ PostTypeId │ 39.49 KiB │ 56.84 MiB │ 1474.01 │ -│ CreationDate │ 181.23 MiB │ 454.76 MiB │ 2.51 │ -│ LastEditDate │ 134.07 MiB │ 454.76 MiB │ 3.39 │ -│ LastEditorDisplayName │ 2.15 MiB │ 6.25 MiB │ 2.91 │ -│ CommunityOwnedDate │ 824.60 KiB │ 1.34 MiB │ 1.66 │ -└───────────────────────┴─────────────────┴───────────────────┴─────────┘ -``` + FROM system.columns + WHERE `table` = 'posts_v3' + GROUP BY name + + ┌─name──────────────────┬─compressed_size─┬─uncompressed_size─┬───ratio─┐ + │ Body │ 23.10 GiB │ 63.63 GiB │ 2.75 │ + │ Title │ 614.65 MiB │ 1.28 GiB │ 2.14 │ + │ Score │ 40.28 MiB │ 227.38 MiB │ 5.65 │ + │ Tags │ 234.05 MiB │ 688.49 MiB │ 2.94 │ + │ ParentId │ 107.78 MiB │ 321.33 MiB │ 2.98 │ + │ Id │ 159.70 MiB │ 227.38 MiB │ 1.42 │ + │ AcceptedAnswerId │ 40.34 MiB │ 227.38 MiB │ 5.64 │ + │ ClosedDate │ 5.93 MiB │ 9.49 MiB │ 1.6 │ + │ LastActivityDate │ 246.55 MiB │ 454.76 MiB │ 1.84 │ + │ CommentCount │ 635.78 KiB │ 56.84 MiB │ 91.55 │ + │ OwnerUserId │ 183.86 MiB │ 227.38 MiB │ 1.24 │ + │ AnswerCount │ 9.67 MiB │ 113.69 MiB │ 11.76 │ + │ FavoriteCount │ 19.77 KiB │ 147.32 KiB │ 7.45 │ + │ ViewCount │ 45.04 MiB │ 227.38 MiB │ 5.05 │ + │ LastEditorUserId │ 86.25 MiB │ 227.38 MiB │ 2.64 │ + │ ContentLicense │ 2.17 MiB │ 57.10 MiB │ 26.37 │ + │ OwnerDisplayName │ 5.95 MiB │ 16.19 MiB │ 2.72 │ + │ PostTypeId │ 39.49 KiB │ 56.84 MiB │ 1474.01 │ + │ CreationDate │ 181.23 MiB │ 454.76 MiB │ 2.51 │ + │ LastEditDate │ 134.07 MiB │ 454.76 MiB │ 3.39 │ + │ LastEditorDisplayName │ 2.15 MiB │ 6.25 MiB │ 2.91 │ + │ CommunityOwnedDate │ 824.60 KiB │ 1.34 MiB │ 1.66 │ + └───────────────────────┴─────────────────┴───────────────────┴─────────┘ + ``` ## Choosing the right column compression codec {#choosing-the-right-column-compression-codec} diff --git a/docs/data-modeling/backfilling.md b/docs/data-modeling/backfilling.md index 768cd380a8d..1129a8e8c60 100644 --- a/docs/data-modeling/backfilling.md +++ b/docs/data-modeling/backfilling.md @@ -76,9 +76,9 @@ We will attempt to cover the following scenarios: 1. **Backfilling data with existing data ingestion** - New data is being loaded, and historical data needs to be backfilled. This historical data has been identified. 2. 
**Adding materialized views to existing tables** - New materialized views need to be added to a setup for which historical data has been populated and data is already streaming. -We assume data will be backfilled from object storage. In all cases, we aim to avoid pauses in data insertion. + We assume data will be backfilled from object storage. In all cases, we aim to avoid pauses in data insertion. -We recommend backfilling historical data from object storage. Data should be exported to Parquet where possible for optimal read performance and compression (reduced network transfer). A file size of around 150MB is typically preferred, but ClickHouse supports over [70 file formats](/interfaces/formats) and is capable of handling files of all sizes. + We recommend backfilling historical data from object storage. Data should be exported to Parquet where possible for optimal read performance and compression (reduced network transfer). A file size of around 150MB is typically preferred, but ClickHouse supports over [70 file formats](/interfaces/formats) and is capable of handling files of all sizes. ## Using duplicate tables and views {#using-duplicate-tables-and-views} @@ -135,7 +135,6 @@ SELECT count() FROM pypi SELECT sum(count) FROM pypi_downloads - ┌─sum(count)─┐ │ 20612750 │ -- 20.61 million └────────────┘ @@ -271,57 +270,57 @@ This process follows the following steps: 4. Insert into our duplicate main table created in step (2). 5. Move all partitions from the duplicate tables to their original versions. Drop duplicate tables. -For example, in our PyPI data suppose we have data loaded. We can identify the minimum timestamp and, thus, our "checkpoint". + For example, in our PyPI data suppose we have data loaded. We can identify the minimum timestamp and, thus, our "checkpoint". -```sql -SELECT min(timestamp) -FROM pypi + ```sql + SELECT min(timestamp) + FROM pypi -┌──────min(timestamp)─┐ -│ 2024-12-17 09:00:00 │ -└─────────────────────┘ + ┌──────min(timestamp)─┐ + │ 2024-12-17 09:00:00 │ + └─────────────────────┘ -1 row in set. Elapsed: 0.163 sec. Processed 1.34 billion rows, 5.37 GB (8.24 billion rows/s., 32.96 GB/s.) -Peak memory usage: 227.84 MiB. -``` + 1 row in set. Elapsed: 0.163 sec. Processed 1.34 billion rows, 5.37 GB (8.24 billion rows/s., 32.96 GB/s.) + Peak memory usage: 227.84 MiB. + ``` -From the above, we know we need to load data prior to `2024-12-17 09:00:00`. Using our earlier process, we create duplicate tables and views and load the subset using a filter on the timestamp. + From the above, we know we need to load data prior to `2024-12-17 09:00:00`. Using our earlier process, we create duplicate tables and views and load the subset using a filter on the timestamp. 
-```sql -CREATE TABLE pypi_v2 AS pypi + ```sql + CREATE TABLE pypi_v2 AS pypi -CREATE TABLE pypi_downloads_v2 AS pypi_downloads + CREATE TABLE pypi_downloads_v2 AS pypi_downloads -CREATE MATERIALIZED VIEW pypi_downloads_mv_v2 TO pypi_downloads_v2 -AS SELECT project, count() AS count -FROM pypi_v2 -GROUP BY project + CREATE MATERIALIZED VIEW pypi_downloads_mv_v2 TO pypi_downloads_v2 + AS SELECT project, count() AS count + FROM pypi_v2 + GROUP BY project -INSERT INTO pypi_v2 SELECT * -FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/pypi/2024-12-17/1734393600-*.parquet') -WHERE timestamp < '2024-12-17 09:00:00' + INSERT INTO pypi_v2 SELECT * + FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/pypi/2024-12-17/1734393600-*.parquet') + WHERE timestamp < '2024-12-17 09:00:00' -0 rows in set. Elapsed: 500.152 sec. Processed 2.74 billion rows, 364.40 GB (5.47 million rows/s., 728.59 MB/s.) -``` -:::note -Filtering on timestamp columns in Parquet can be very efficient. ClickHouse will only read the timestamp column to identify the full data ranges to load, minimizing network traffic. Parquet indices, such as min-max, can also be exploited by the ClickHouse query engine. -::: + 0 rows in set. Elapsed: 500.152 sec. Processed 2.74 billion rows, 364.40 GB (5.47 million rows/s., 728.59 MB/s.) + ``` + :::note + Filtering on timestamp columns in Parquet can be very efficient. ClickHouse will only read the timestamp column to identify the full data ranges to load, minimizing network traffic. Parquet indices, such as min-max, can also be exploited by the ClickHouse query engine. + ::: -Once this insert is complete, we can move the associated partitions. + Once this insert is complete, we can move the associated partitions. -```sql -ALTER TABLE pypi - (MOVE PARTITION () FROM pypi_v2) + ```sql + ALTER TABLE pypi + (MOVE PARTITION () FROM pypi_v2) -ALTER TABLE pypi_downloads - (MOVE PARTITION () FROM pypi_downloads_v2) -``` + ALTER TABLE pypi_downloads + (MOVE PARTITION () FROM pypi_downloads_v2) + ``` -If the historical data is an isolated bucket, the above time filter is not required. If a time or monotonic column is unavailable, isolate your historical data. + If the historical data is an isolated bucket, the above time filter is not required. If a time or monotonic column is unavailable, isolate your historical data. -:::note Just use ClickPipes in ClickHouse Cloud -ClickHouse Cloud users should use ClickPipes for restoring historical backups if the data can be isolated in its own bucket (and a filter is not required). As well as parallelizing the load with multiple workers, thus reducing the load time, ClickPipes automates the above process - creating duplicate tables for both the main table and materialized views. -::: + :::note Just use ClickPipes in ClickHouse Cloud + ClickHouse Cloud users should use ClickPipes for restoring historical backups if the data can be isolated in its own bucket (and a filter is not required). As well as parallelizing the load with multiple workers, thus reducing the load time, ClickPipes automates the above process - creating duplicate tables for both the main table and materialized views. + ::: ## Scenario 2: Adding materialized views to existing tables {#scenario-2-adding-materialized-views-to-existing-tables} @@ -340,80 +339,79 @@ Our simplest approach involves the following steps: 1. Create our materialized view with a filter that only considers rows greater than an arbitrary time in the near future. 2. 
Run an `INSERT INTO SELECT` query which inserts into our materialized view's target table, reading from the source table with the view's aggregation query.
-This can be further enhanced to target subsets of data in step (2) and/or use a duplicate target table for the materialized view (attach partitions to the original once the insert is complete) for easier recovery after failure.
+ This can be further enhanced to target subsets of data in step (2) and/or use a duplicate target table for the materialized view (attach partitions to the original once the insert is complete) for easier recovery after failure.

-Consider the following materialized view, which computes the most popular projects per hour.
+ Consider the following materialized view, which computes the most popular projects per hour.

-```sql
-CREATE TABLE pypi_downloads_per_day
-(
+ ```sql
+ CREATE TABLE pypi_downloads_per_day
+ (
    `hour` DateTime,
    `project` String,
    `count` Int64
-)
-ENGINE = SummingMergeTree
-ORDER BY (project, hour)
-
-
-CREATE MATERIALIZED VIEW pypi_downloads_per_day_mv TO pypi_downloads_per_day
-AS SELECT
-  toStartOfHour(timestamp) as hour,
-  project,
+ )
+ ENGINE = SummingMergeTree
+ ORDER BY (project, hour)
+
+ CREATE MATERIALIZED VIEW pypi_downloads_per_day_mv TO pypi_downloads_per_day
+ AS SELECT
+   toStartOfHour(timestamp) as hour,
+   project,
    count() AS count
-FROM pypi
-GROUP BY
+ FROM pypi
+ GROUP BY
  hour,
-  project
-```
+   project
+ ```

-While we can add the target table, prior to adding the materialized view we modify its `SELECT` clause to include a filer which only considers rows greater than an arbitrary time in the near future - in this case we assume `2024-12-17 09:00:00` is a few minutes in the future.
+ While we can add the target table, prior to adding the materialized view we modify its `SELECT` clause to include a filter which only considers rows greater than an arbitrary time in the near future - in this case we assume `2024-12-17 09:00:00` is a few minutes in the future.

-```sql
-CREATE MATERIALIZED VIEW pypi_downloads_per_day_mv TO pypi_downloads_per_day
-AS SELECT
-  toStartOfHour(timestamp) AS hour,
-  project, count() AS count
-FROM pypi WHERE timestamp >= '2024-12-17 09:00:00'
-GROUP BY hour, project
-```
+ ```sql
+ CREATE MATERIALIZED VIEW pypi_downloads_per_day_mv TO pypi_downloads_per_day
+ AS SELECT
+   toStartOfHour(timestamp) AS hour,
+   project, count() AS count
+ FROM pypi WHERE timestamp >= '2024-12-17 09:00:00'
+ GROUP BY hour, project
+ ```

-Once this view is added, we can backfill all data for the materialized view prior to this data.
+ Once this view is added, we can backfill all data for the materialized view prior to this data.

-The simplest means of doing this is to simply run the query from the materialized view on the main table with a filter that ignores recently added data, inserting the results into our view's target table via an `INSERT INTO SELECT`.
+ The simplest means of doing this is to run the query from the materialized view on the main table with a filter that ignores recently added data, inserting the results into our view's target table via an `INSERT INTO SELECT`.
For example, for the above view: -```sql -INSERT INTO pypi_downloads_per_day SELECT - toStartOfHour(timestamp) AS hour, - project, + ```sql + INSERT INTO pypi_downloads_per_day SELECT + toStartOfHour(timestamp) AS hour, + project, count() AS count -FROM pypi -WHERE timestamp < '2024-12-17 09:00:00' -GROUP BY + FROM pypi + WHERE timestamp < '2024-12-17 09:00:00' + GROUP BY hour, - project + project -Ok. + Ok. -0 rows in set. Elapsed: 2.830 sec. Processed 798.89 million rows, 17.40 GB (282.28 million rows/s., 6.15 GB/s.) -Peak memory usage: 543.71 MiB. -``` + 0 rows in set. Elapsed: 2.830 sec. Processed 798.89 million rows, 17.40 GB (282.28 million rows/s., 6.15 GB/s.) + Peak memory usage: 543.71 MiB. + ``` -:::note -In the above example our target table is a [SummingMergeTree](/engines/table-engines/mergetree-family/summingmergetree). In this case we can simply use our original aggregation query. For more complex use cases which exploit the [AggregatingMergeTree](/engines/table-engines/mergetree-family/aggregatingmergetree), users will use `-State` functions for the aggregates. An example of this can be found [here](/integrations/s3/performance#be-aware-of-merges). -::: + :::note + In the above example our target table is a [SummingMergeTree](/engines/table-engines/mergetree-family/summingmergetree). In this case we can simply use our original aggregation query. For more complex use cases which exploit the [AggregatingMergeTree](/engines/table-engines/mergetree-family/aggregatingmergetree), users will use `-State` functions for the aggregates. An example of this can be found [here](/integrations/s3/performance#be-aware-of-merges). + ::: -In our case, this is a relatively lightweight aggregation that completes in under 3s and uses less than 600MiB of memory. For more complex or longer-running aggregations, users can make this process more resilient by using the earlier duplicate table approach i.e. create a shadow target table, e.g., `pypi_downloads_per_day_v2`, insert into this, and attach its resulting partitions to `pypi_downloads_per_day`. + In our case, this is a relatively lightweight aggregation that completes in under 3s and uses less than 600MiB of memory. For more complex or longer-running aggregations, users can make this process more resilient by using the earlier duplicate table approach i.e. create a shadow target table, e.g., `pypi_downloads_per_day_v2`, insert into this, and attach its resulting partitions to `pypi_downloads_per_day`. -Often materialized view's query can be more complex (not uncommon as otherwise users wouldn't use a view!) and consume resources. In rarer cases, the resources for the query are beyond that of the server. This highlights one of the advantages of ClickHouse materialized views - they are incremental and don't process the entire dataset in one go! + Often materialized view's query can be more complex (not uncommon as otherwise users wouldn't use a view!) and consume resources. In rarer cases, the resources for the query are beyond that of the server. This highlights one of the advantages of ClickHouse materialized views - they are incremental and don't process the entire dataset in one go! -In this case, users have several options: + In this case, users have several options: 1. Modify your query to backfill ranges e.g. `WHERE timestamp BETWEEN 2024-12-17 08:00:00 AND 2024-12-17 09:00:00`, `WHERE timestamp BETWEEN 2024-12-17 07:00:00 AND 2024-12-17 08:00:00` etc. 2. 
Use a [Null table engine](/engines/table-engines/special/null) to fill the materialized view. This replicates the typical incremental population of a materialized view, executing it's query over blocks of data (of configurable size). -(1) represents the simplest approach is often sufficient. We do not include examples for brevity. + (1) represents the simplest approach is often sufficient. We do not include examples for brevity. -We explore (2) further below. + We explore (2) further below. #### Using a Null table engine for filling materialized views {#using-a-null-table-engine-for-filling-materialized-views} @@ -472,56 +470,56 @@ Several factors will determine the performance and resources used in the above s - **Insert Block Size** - data is processed in a loop where it is pulled, parsed, and formed into in-memory insert blocks based on the [partitioning key](/engines/table-engines/mergetree-family/custom-partitioning-key). These blocks are sorted, optimized, compressed, and written to storage as new [data parts](/parts). The size of the insert block, controlled by settings [`min_insert_block_size_rows`](/operations/settings/settings#min_insert_block_size_rows) and [`min_insert_block_size_bytes`](/operations/settings/settings#min_insert_block_size_bytes) (uncompressed), impacts memory usage and disk I/O. Larger blocks use more memory but create fewer parts, reducing I/O and background merges. These settings represent minimum thresholds (whichever is reached first triggers a flush). - **Materialized view block size** - As well as the above mechanics for the main insert, prior to insertion into materialized views, blocks are also squashed for more efficient processing. The size of these blocks is determined by the settings [`min_insert_block_size_bytes_for_materialized_views`](/operations/settings/settings#min_insert_block_size_bytes_for_materialized_views) and [`min_insert_block_size_rows_for_materialized_views`](/operations/settings/settings#min_insert_block_size_rows_for_materialized_views). Larger blocks allow more efficient processing at the expense of greater memory usage. By default, these settings revert to the values of the source table settings [`min_insert_block_size_rows`](/operations/settings/settings#min_insert_block_size_rows) and [`min_insert_block_size_bytes`](/operations/settings/settings#min_insert_block_size_bytes), respectively. -For improving performance, users can follow the guidelines outlined in the [Tuning Threads and Block Size for Inserts](/integrations/s3/performance#tuning-threads-and-block-size-for-inserts) section of the [Optimizing for S3 Insert and Read Performance guide](/integrations/s3/performance). It should not be necessary to also modify `min_insert_block_size_bytes_for_materialized_views` and `min_insert_block_size_rows_for_materialized_views` to improve performance in most cases. If these are modified, use the same best practices as discussed for `min_insert_block_size_rows` and `min_insert_block_size_bytes`. + For improving performance, users can follow the guidelines outlined in the [Tuning Threads and Block Size for Inserts](/integrations/s3/performance#tuning-threads-and-block-size-for-inserts) section of the [Optimizing for S3 Insert and Read Performance guide](/integrations/s3/performance). It should not be necessary to also modify `min_insert_block_size_bytes_for_materialized_views` and `min_insert_block_size_rows_for_materialized_views` to improve performance in most cases. 
If these are modified, use the same best practices as discussed for `min_insert_block_size_rows` and `min_insert_block_size_bytes`. -To minimize memory, users may wish to experiment with these settings. This will invariably lower performance. Using the earlier query, we show examples below. + To minimize memory, users may wish to experiment with these settings. This will invariably lower performance. Using the earlier query, we show examples below. -Lowering `max_insert_threads` to 1 reduces our memory overhead. + Lowering `max_insert_threads` to 1 reduces our memory overhead. -```sql -INSERT INTO pypi_v2 -SELECT + ```sql + INSERT INTO pypi_v2 + SELECT timestamp, - project -FROM pypi -WHERE timestamp < '2024-12-17 09:00:00' -SETTINGS max_insert_threads = 1 + project + FROM pypi + WHERE timestamp < '2024-12-17 09:00:00' + SETTINGS max_insert_threads = 1 -0 rows in set. Elapsed: 27.752 sec. Processed 1.50 billion rows, 33.48 GB (53.89 million rows/s., 1.21 GB/s.) -Peak memory usage: 506.78 MiB. -``` + 0 rows in set. Elapsed: 27.752 sec. Processed 1.50 billion rows, 33.48 GB (53.89 million rows/s., 1.21 GB/s.) + Peak memory usage: 506.78 MiB. + ``` -We can lower memory further by reducing our `max_threads` setting to 1. + We can lower memory further by reducing our `max_threads` setting to 1. -```sql -INSERT INTO pypi_v2 -SELECT timestamp, project -FROM pypi -WHERE timestamp < '2024-12-17 09:00:00' -SETTINGS max_insert_threads = 1, max_threads = 1 + ```sql + INSERT INTO pypi_v2 + SELECT timestamp, project + FROM pypi + WHERE timestamp < '2024-12-17 09:00:00' + SETTINGS max_insert_threads = 1, max_threads = 1 -Ok. + Ok. -0 rows in set. Elapsed: 43.907 sec. Processed 1.50 billion rows, 33.48 GB (34.06 million rows/s., 762.54 MB/s.) -Peak memory usage: 272.53 MiB. -``` + 0 rows in set. Elapsed: 43.907 sec. Processed 1.50 billion rows, 33.48 GB (34.06 million rows/s., 762.54 MB/s.) + Peak memory usage: 272.53 MiB. + ``` -Finally, we can reduce memory further by setting `min_insert_block_size_rows` to 0 (disables it as a deciding factor on block size) and `min_insert_block_size_bytes` to 10485760 (10MiB). + Finally, we can reduce memory further by setting `min_insert_block_size_rows` to 0 (disables it as a deciding factor on block size) and `min_insert_block_size_bytes` to 10485760 (10MiB). -```sql -INSERT INTO pypi_v2 -SELECT + ```sql + INSERT INTO pypi_v2 + SELECT timestamp, - project -FROM pypi -WHERE timestamp < '2024-12-17 09:00:00' -SETTINGS max_insert_threads = 1, max_threads = 1, min_insert_block_size_rows = 0, min_insert_block_size_bytes = 10485760 + project + FROM pypi + WHERE timestamp < '2024-12-17 09:00:00' + SETTINGS max_insert_threads = 1, max_threads = 1, min_insert_block_size_rows = 0, min_insert_block_size_bytes = 10485760 -0 rows in set. Elapsed: 43.293 sec. Processed 1.50 billion rows, 33.48 GB (34.54 million rows/s., 773.36 MB/s.) -Peak memory usage: 218.64 MiB. -``` + 0 rows in set. Elapsed: 43.293 sec. Processed 1.50 billion rows, 33.48 GB (34.54 million rows/s., 773.36 MB/s.) + Peak memory usage: 218.64 MiB. + ``` -Finally, be aware that lowering block sizes produces more parts and causes greater merge pressure. As discussed [here](/integrations/s3/performance#be-aware-of-merges), these settings should be changed cautiously. + Finally, be aware that lowering block sizes produces more parts and causes greater merge pressure. As discussed [here](/integrations/s3/performance#be-aware-of-merges), these settings should be changed cautiously. 
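One way to keep an eye on this while experimenting is to check `system.parts` after a backfill run. This is a sketch only; the `pypi_v2` table name simply follows the example above.

```sql
-- Sketch: count the active parts the backfill produced and their size on disk
SELECT
    count() AS active_parts,
    formatReadableSize(sum(bytes_on_disk)) AS total_on_disk,
    formatReadableSize(avg(bytes_on_disk)) AS avg_part_size
FROM system.parts
WHERE (table = 'pypi_v2') AND active
```

A sharp increase in the number of active parts after lowering the block size settings indicates that background merges will have correspondingly more work to do.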
### No timestamp or monotonically increasing column {#no-timestamp-or-monotonically-increasing-column} @@ -534,94 +532,93 @@ The above processes rely on the user have a timestamp or monotonically increasin 5. Restart inserts. **Note:** Inserts will only update the target table, and not the duplicate, which will reference only the original data. 6. Backfill the materialized view, applying the same process used above for data with timestamps, using the duplicate table as the source. -Consider the following example using PyPI and our previous new materialized view `pypi_downloads_per_day` (we'll assume we can't use the timestamp): + Consider the following example using PyPI and our previous new materialized view `pypi_downloads_per_day` (we'll assume we can't use the timestamp): -```sql -SELECT count() FROM pypi + ```sql + SELECT count() FROM pypi -┌────count()─┐ -│ 2039988137 │ -- 2.04 billion -└────────────┘ + ┌────count()─┐ + │ 2039988137 │ -- 2.04 billion + └────────────┘ -1 row in set. Elapsed: 0.003 sec. + 1 row in set. Elapsed: 0.003 sec. --- (1) Pause inserts --- (2) Create a duplicate of our target table + -- (1) Pause inserts + -- (2) Create a duplicate of our target table -CREATE TABLE pypi_v2 AS pypi + CREATE TABLE pypi_v2 AS pypi -SELECT count() FROM pypi_v2 + SELECT count() FROM pypi_v2 -┌────count()─┐ -│ 2039988137 │ -- 2.04 billion -└────────────┘ + ┌────count()─┐ + │ 2039988137 │ -- 2.04 billion + └────────────┘ -1 row in set. Elapsed: 0.004 sec. + 1 row in set. Elapsed: 0.004 sec. --- (3) Attach partitions from the original target table to the duplicate. + -- (3) Attach partitions from the original target table to the duplicate. -ALTER TABLE pypi_v2 - (ATTACH PARTITION tuple() FROM pypi) + ALTER TABLE pypi_v2 + (ATTACH PARTITION tuple() FROM pypi) --- (4) Create our new materialized views + -- (4) Create our new materialized views -CREATE TABLE pypi_downloads_per_day -( + CREATE TABLE pypi_downloads_per_day + ( `hour` DateTime, `project` String, `count` Int64 -) -ENGINE = SummingMergeTree -ORDER BY (project, hour) - - -CREATE MATERIALIZED VIEW pypi_downloads_per_day_mv TO pypi_downloads_per_day -AS SELECT - toStartOfHour(timestamp) as hour, - project, + ) + ENGINE = SummingMergeTree + ORDER BY (project, hour) + + CREATE MATERIALIZED VIEW pypi_downloads_per_day_mv TO pypi_downloads_per_day + AS SELECT + toStartOfHour(timestamp) as hour, + project, count() AS count -FROM pypi -GROUP BY + FROM pypi + GROUP BY hour, - project + project --- (4) Restart inserts. We replicate here by inserting a single row. + -- (4) Restart inserts. We replicate here by inserting a single row. -INSERT INTO pypi SELECT * -FROM pypi -LIMIT 1 + INSERT INTO pypi SELECT * + FROM pypi + LIMIT 1 -SELECT count() FROM pypi + SELECT count() FROM pypi -┌────count()─┐ -│ 2039988138 │ -- 2.04 billion -└────────────┘ + ┌────count()─┐ + │ 2039988138 │ -- 2.04 billion + └────────────┘ -1 row in set. Elapsed: 0.003 sec. + 1 row in set. Elapsed: 0.003 sec. 
--- notice how pypi_v2 contains same number of rows as before + -- notice how pypi_v2 contains same number of rows as before -SELECT count() FROM pypi_v2 -┌────count()─┐ -│ 2039988137 │ -- 2.04 billion -└────────────┘ + SELECT count() FROM pypi_v2 + ┌────count()─┐ + │ 2039988137 │ -- 2.04 billion + └────────────┘ --- (5) Backfill the view using the backup pypi_v2 + -- (5) Backfill the view using the backup pypi_v2 -INSERT INTO pypi_downloads_per_day SELECT - toStartOfHour(timestamp) as hour, - project, + INSERT INTO pypi_downloads_per_day SELECT + toStartOfHour(timestamp) as hour, + project, count() AS count -FROM pypi_v2 -GROUP BY + FROM pypi_v2 + GROUP BY hour, - project + project -0 rows in set. Elapsed: 3.719 sec. Processed 2.04 billion rows, 47.15 GB (548.57 million rows/s., 12.68 GB/s.) + 0 rows in set. Elapsed: 3.719 sec. Processed 2.04 billion rows, 47.15 GB (548.57 million rows/s., 12.68 GB/s.) -DROP TABLE pypi_v2; -``` + DROP TABLE pypi_v2; + ``` -In the penultimate step we backfill `pypi_downloads_per_day` using our simple `INSERT INTO SELECT` approach described [earlier](#timestamp-or-monotonically-increasing-column-available). This can also be enhanced using the Null table approach documented [above](#using-a-null-table-engine-for-filling-materialized-views), with the optional use of a duplicate table for more resiliency. + In the penultimate step we backfill `pypi_downloads_per_day` using our simple `INSERT INTO SELECT` approach described [earlier](#timestamp-or-monotonically-increasing-column-available). This can also be enhanced using the Null table approach documented [above](#using-a-null-table-engine-for-filling-materialized-views), with the optional use of a duplicate table for more resiliency. -While this operation does require inserts to be paused, the intermediate operations can typically be completed quickly - minimizing any data interruption. + While this operation does require inserts to be paused, the intermediate operations can typically be completed quickly - minimizing any data interruption. diff --git a/docs/data-modeling/denormalization.md b/docs/data-modeling/denormalization.md index 826057fde6d..8b8f7d6cbff 100644 --- a/docs/data-modeling/denormalization.md +++ b/docs/data-modeling/denormalization.md @@ -34,9 +34,9 @@ In general, we would recommend denormalizing in the following cases: - Avoid denormalizing high cardinality relationships. If each row in a table has thousands of related entries in another table, these will need to be represented as an `Array` - either of a primitive type or tuples. Generally, arrays with more than 1000 tuples would not be recommended. - Rather than denormalizing all columns as nested objects, consider denormalizing just a statistic using materialized views (see below). -All information doesn't need to be denormalized - just the key information that needs to be frequently accessed. + All information doesn't need to be denormalized - just the key information that needs to be frequently accessed. -The denormalization work can be handled in either ClickHouse or upstream e.g. using Apache Flink. + The denormalization work can be handled in either ClickHouse or upstream e.g. using Apache Flink. ## Avoid denormalization on frequently updated data {#avoid-denormalization-on-frequently-updated-data} @@ -49,9 +49,7 @@ Achieving this in real-time is often unrealistic and requires significant engine 1. Triggering the correct join statements when a table row changes. 
This should ideally not cause all objects for the join to be updated - rather just those that have been impacted. Modifying the joins to filter to the correct rows efficiently, and achieving this under high throughput, requires external tooling or engineering. 1. Row updates in ClickHouse need to be carefully managed, introducing additional complexity. -
- -A batch update process is thus more common, where all of the denormalized objects are periodically reloaded. + A batch update process is thus more common, where all of the denormalized objects are periodically reloaded. ## Practical cases for denormalization {#practical-cases-for-denormalization} @@ -297,63 +295,63 @@ In cases of complex objects or one-to-many relationships, users can use: - Named Tuples - These allow a related structure to be represented as a set of columns. - Array(Tuple) or Nested - An array of named tuples, also known as Nested, with each entry representing an object. Applicable to one-to-many relationships. -As an example, we demonstrate denormalizing `PostLinks` on to `Posts` below. + As an example, we demonstrate denormalizing `PostLinks` on to `Posts` below. -Each post can contain a number of links to other posts as shown in the `PostLinks` schema earlier. As a Nested type, we might represent these linked and duplicates posts as follows: + Each post can contain a number of links to other posts as shown in the `PostLinks` schema earlier. As a Nested type, we might represent these linked and duplicates posts as follows: -```sql -SET flatten_nested=0 -CREATE TABLE posts_with_links -( - `Id` Int32 CODEC(Delta(4), ZSTD(1)), - ... -other columns - `LinkedPosts` Nested(CreationDate DateTime64(3, 'UTC'), PostId Int32), - `DuplicatePosts` Nested(CreationDate DateTime64(3, 'UTC'), PostId Int32), -) ENGINE = MergeTree -ORDER BY (PostTypeId, toDate(CreationDate), CommentCount) -``` + ```sql + SET flatten_nested=0 + CREATE TABLE posts_with_links + ( + `Id` Int32 CODEC(Delta(4), ZSTD(1)), + ... -other columns + `LinkedPosts` Nested(CreationDate DateTime64(3, 'UTC'), PostId Int32), + `DuplicatePosts` Nested(CreationDate DateTime64(3, 'UTC'), PostId Int32), + ) ENGINE = MergeTree + ORDER BY (PostTypeId, toDate(CreationDate), CommentCount) + ``` -> Note the use of the setting `flatten_nested=0`. We recommend disabling the flattening of nested data. + > Note the use of the setting `flatten_nested=0`. We recommend disabling the flattening of nested data. -We can perform this denormalization using an `INSERT INTO SELECT` with an `OUTER JOIN` query: + We can perform this denormalization using an `INSERT INTO SELECT` with an `OUTER JOIN` query: -```sql -INSERT INTO posts_with_links -SELECT + ```sql + INSERT INTO posts_with_links + SELECT posts.*, arrayMap(p -> (p.1, p.2), arrayFilter(p -> p.3 = 'Linked' AND p.2 != 0, Related)) AS LinkedPosts, arrayMap(p -> (p.1, p.2), arrayFilter(p -> p.3 = 'Duplicate' AND p.2 != 0, Related)) AS DuplicatePosts -FROM posts -LEFT JOIN ( + FROM posts + LEFT JOIN ( SELECT PostId, groupArray((CreationDate, RelatedPostId, LinkTypeId)) AS Related FROM postlinks GROUP BY PostId -) AS postlinks ON posts.Id = postlinks.PostId + ) AS postlinks ON posts.Id = postlinks.PostId -0 rows in set. Elapsed: 155.372 sec. Processed 66.37 million rows, 76.33 GB (427.18 thousand rows/s., 491.25 MB/s.) -Peak memory usage: 6.98 GiB. -``` + 0 rows in set. Elapsed: 155.372 sec. Processed 66.37 million rows, 76.33 GB (427.18 thousand rows/s., 491.25 MB/s.) + Peak memory usage: 6.98 GiB. + ``` -> Note the timing here. We've managed to denormalize 66m rows in around 2mins. As we'll see later, this is an operation we can schedule. + > Note the timing here. We've managed to denormalize 66m rows in around 2mins. As we'll see later, this is an operation we can schedule. 
-Note the use of the `groupArray` functions to collapse the `PostLinks` down into an array for each `PostId`, prior to joining. This array is then filtered into two sublists: `LinkedPosts` and `DuplicatePosts`, which also exclude any empty results from the outer join. + Note the use of the `groupArray` functions to collapse the `PostLinks` down into an array for each `PostId`, prior to joining. This array is then filtered into two sublists: `LinkedPosts` and `DuplicatePosts`, which also exclude any empty results from the outer join. -We can select some rows to see our new denormalized structure: + We can select some rows to see our new denormalized structure: -```sql -SELECT LinkedPosts, DuplicatePosts -FROM posts_with_links -WHERE (length(LinkedPosts) > 2) AND (length(DuplicatePosts) > 0) -LIMIT 1 -FORMAT Vertical - -Row 1: -────── -LinkedPosts: [('2017-04-11 11:53:09.583',3404508),('2017-04-11 11:49:07.680',3922739),('2017-04-11 11:48:33.353',33058004)] -DuplicatePosts: [('2017-04-11 12:18:37.260',3922739),('2017-04-11 12:18:37.260',33058004)] -``` + ```sql + SELECT LinkedPosts, DuplicatePosts + FROM posts_with_links + WHERE (length(LinkedPosts) > 2) AND (length(DuplicatePosts) > 0) + LIMIT 1 + FORMAT Vertical + + Row 1: + ────── + LinkedPosts: [('2017-04-11 11:53:09.583',3404508),('2017-04-11 11:49:07.680',3922739),('2017-04-11 11:48:33.353',33058004)] + DuplicatePosts: [('2017-04-11 12:18:37.260',3922739),('2017-04-11 12:18:37.260',33058004)] + ``` ## Orchestrating and scheduling denormalization {#orchestrating-and-scheduling-denormalization} @@ -368,7 +366,6 @@ Users have several options for orchestrating this in ClickHouse, assuming a peri - **[Refreshable Materialized Views](/materialized-view/refreshable-materialized-view)** - Refreshable materialized views can be used to periodically schedule a query with the results sent to a target table. On query execution, the view ensures the target table is atomically updated. This provides a ClickHouse native means of scheduling this work. - **External tooling** - Utilizing tools such as [dbt](https://www.getdbt.com/) and [Airflow](https://airflow.apache.org/) to periodically schedule the transformation. The [ClickHouse integration for dbt](/integrations/dbt) ensures this is performed atomically with a new version of the target table created and then atomically swapped with the version receiving queries (via the [EXCHANGE](/sql-reference/statements/exchange) command). - ### Streaming {#streaming} Users may alternatively wish to perform this outside of ClickHouse, prior to insertion, using streaming technologies such as [Apache Flink](https://flink.apache.org/). Alternatively, incremental [materialized views](/guides/developer/cascading-materialized-views) can be used to perform this process as data is inserted. 
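As a rough illustration of the refreshable materialized view option mentioned above, a periodic rebuild of the denormalized table could be sketched as follows. The one-day schedule and the view name are assumptions for illustration, and depending on your ClickHouse version an experimental setting may need to be enabled for refreshable materialized views.

```sql
-- Sketch only: periodically re-run the denormalization and replace the contents of posts_with_links
CREATE MATERIALIZED VIEW posts_with_links_refresh
REFRESH EVERY 1 DAY TO posts_with_links
AS SELECT
    posts.*,
    arrayMap(p -> (p.1, p.2), arrayFilter(p -> p.3 = 'Linked' AND p.2 != 0, Related)) AS LinkedPosts,
    arrayMap(p -> (p.1, p.2), arrayFilter(p -> p.3 = 'Duplicate' AND p.2 != 0, Related)) AS DuplicatePosts
FROM posts
LEFT JOIN
(
    SELECT
        PostId,
        groupArray((CreationDate, RelatedPostId, LinkTypeId)) AS Related
    FROM postlinks
    GROUP BY PostId
) AS postlinks ON posts.Id = postlinks.PostId
```

Each refresh recomputes the full join and swaps the result into `posts_with_links`, which mirrors the batch reload pattern described earlier without requiring an external scheduler.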
diff --git a/docs/data-modeling/index.md b/docs/data-modeling/index.md index 43ed151580c..bf4c77c2b8f 100644 --- a/docs/data-modeling/index.md +++ b/docs/data-modeling/index.md @@ -5,7 +5,7 @@ description: 'Overview of Data Modelling' keywords: ['data modelling', 'schema design', 'dictionary', 'materialized view', 'data compression', 'denormalizing data'] --- -# Data Modeling +# Data Modeling This section is about data modeling in ClickHouse and contains the following topics: diff --git a/docs/data-modeling/projections/1_projections.md b/docs/data-modeling/projections/1_projections.md index 48654ca248a..933f01040a9 100644 --- a/docs/data-modeling/projections/1_projections.md +++ b/docs/data-modeling/projections/1_projections.md @@ -23,88 +23,87 @@ queries by creating a reordering of data by attributes of interest. This can be: 1. A complete reordering 2. A subset of the original table with a different order 3. A precomputed aggregation (similar to a materialized view) but with an ordering - aligned to the aggregation. + aligned to the aggregation. -
- + ## How do Projections work? {#how-do-projections-work} Practically, a Projection can be thought of as an additional, hidden table to the -original table. The projection can have a different row order, and therefore a -different primary index, to that of the original table and it can automatically -and incrementally pre-compute aggregate values. As a result, using Projections +original table. The projection can have a different row order, and therefore a +different primary index, to that of the original table and it can automatically +and incrementally pre-compute aggregate values. As a result, using Projections provide two "tuning knobs" for speeding up query execution: - **Properly using primary indexes** - **Pre-computing aggregates** -Projections are in some ways similar to [Materialized Views](/materialized-views) -, which also allow you to have multiple row orders and pre-compute aggregations -at insert time. -Projections are automatically updated and -kept in-sync with the original table, unlike Materialized Views, which are -explicitly updated. When a query targets the original table, -ClickHouse automatically samples the primary keys and chooses a table that can -generate the same correct result, but requires the least amount of data to be -read as shown in the figure below: + Projections are in some ways similar to [Materialized Views](/materialized-views) + , which also allow you to have multiple row orders and pre-compute aggregations + at insert time. + Projections are automatically updated and + kept in-sync with the original table, unlike Materialized Views, which are + explicitly updated. When a query targets the original table, + ClickHouse automatically samples the primary keys and chooses a table that can + generate the same correct result, but requires the least amount of data to be + read as shown in the figure below: -Projections in ClickHouse + Projections in ClickHouse ### Smarter storage with `_part_offset` {#smarter_storage_with_part_offset} -Since version 25.5, ClickHouse supports the virtual column `_part_offset` in +Since version 25.5, ClickHouse supports the virtual column `_part_offset` in projections which offers a new way to define a projection. There are now two ways to define a projection: -- **Store full columns (the original behavior)**: The projection contains full - data and can be read directly, offering faster performance when filters match - the projection’s sort order. +- **Store full columns (the original behavior)**: The projection contains full + data and can be read directly, offering faster performance when filters match + the projection’s sort order. -- **Store only the sorting key + `_part_offset`**: The projection works like an index. - ClickHouse uses the projection’s primary index to locate matching rows, but reads the - actual data from the base table. This reduces storage overhead at the cost of - slightly more I/O at query time. +- **Store only the sorting key + `_part_offset`**: The projection works like an index. + ClickHouse uses the projection’s primary index to locate matching rows, but reads the + actual data from the base table. This reduces storage overhead at the cost of + slightly more I/O at query time. -The approaches above can also be mixed, storing some columns in the projection and -others indirectly via `_part_offset`. + The approaches above can also be mixed, storing some columns in the projection and + others indirectly via `_part_offset`. ## When to use Projections? 
{#when-to-use-projections} -Projections are an appealing feature for new users as they are automatically -maintained as data is inserted. Furthermore, queries can just be sent to a -single table where the projections are exploited where possible to speed up +Projections are an appealing feature for new users as they are automatically +maintained as data is inserted. Furthermore, queries can just be sent to a +single table where the projections are exploited where possible to speed up the response time. -This is in contrast to Materialized Views, where the user has to select the -appropriate optimized target table or rewrite their query, depending on the -filters. This places greater emphasis on user applications and increases +This is in contrast to Materialized Views, where the user has to select the +appropriate optimized target table or rewrite their query, depending on the +filters. This places greater emphasis on user applications and increases client-side complexity. Despite these advantages, projections come with some inherent limitations which users should be aware of and thus should be deployed sparingly. -- Projections don't allow using different TTL for the source table and the - (hidden) target table, materialized views allow different TTLs. +- Projections don't allow using different TTL for the source table and the + (hidden) target table, materialized views allow different TTLs. - Lightweight updates and deletes are not supported for tables with projections. -- Materialized Views can be chained: the target table of one materialized view - can be the source table of another materialized view, and so on. This is not - possible with projections. +- Materialized Views can be chained: the target table of one materialized view + can be the source table of another materialized view, and so on. This is not + possible with projections. - Projections don't support joins, but Materialized Views do. - Projections don't support filters (`WHERE` clause), but Materialized Views do. -We recommend using projections when: + We recommend using projections when: -- A complete re-ordering of the data is required. While the expression in the - projection can, in theory, use a `GROUP BY,` materialized views are more - effective for maintaining aggregates. The query optimizer is also more likely - to exploit projections that use a simple reordering, i.e., `SELECT * ORDER BY x`. - Users can select a subset of columns in this expression to reduce storage - footprint. -- Users are comfortable with the potential associated increase in storage footprint and - overhead of writing data twice. Test the impact on insertion speed and - [evaluate the storage overhead](/data-compression/compression-in-clickhouse). +- A complete re-ordering of the data is required. While the expression in the + projection can, in theory, use a `GROUP BY,` materialized views are more + effective for maintaining aggregates. The query optimizer is also more likely + to exploit projections that use a simple reordering, i.e., `SELECT * ORDER BY x`. + Users can select a subset of columns in this expression to reduce storage + footprint. +- Users are comfortable with the potential associated increase in storage footprint and + overhead of writing data twice. Test the impact on insertion speed and + [evaluate the storage overhead](/data-compression/compression-in-clickhouse). 
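Before moving on to the examples, the two definition styles from the `_part_offset` section above can be sketched side by side. The table and column names below are hypothetical, and the `_part_offset` form assumes version 25.5 or later.

```sql
-- Sketch: one full-column projection and one index-style projection on a hypothetical table
CREATE TABLE events_sketch
(
    `event_date` Date,
    `id` UInt64,
    `region` LowCardinality(String),
    `user_id` UInt64,
    -- full columns: a complete, independently readable reordering of the data
    PROJECTION region_full
    (
        SELECT * ORDER BY region
    ),
    -- sorting key plus _part_offset: used like an index, with the data read from the base table
    PROJECTION user_id_offset
    (
        SELECT user_id, _part_offset ORDER BY user_id
    )
)
ENGINE = MergeTree
ORDER BY (event_date, id)
```

The full-column projection trades additional storage for direct reads in its own order, while the `_part_offset` variant behaves more like a secondary index over the base table.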
## Examples {#examples} @@ -115,10 +114,10 @@ We'll also look at how the projection can be used to speed up queries which filt on columns which are not in the primary key of a table. For this example, we'll be using the New York Taxi Data -dataset available at [sql.clickhouse.com](https://sql.clickhouse.com/) which is ordered +dataset available at [sql.clickhouse.com](https://sql.clickhouse.com/) which is ordered by `pickup_datetime`. -Let's write a simple query to find all the trip IDs for which passengers +Let's write a simple query to find all the trip IDs for which passengers tipped their driver greater than $200: ```sql runnable @@ -130,7 +129,7 @@ FROM nyc_taxi.trips WHERE tip_amount > 200 AND trip_duration_min > 0 ORDER BY tip_amount, trip_id ASC ``` -Notice that because we are filtering on `tip_amount` which is not in the `ORDER BY`, ClickHouse +Notice that because we are filtering on `tip_amount` which is not in the `ORDER BY`, ClickHouse had to do a full table scan. Let's speed this query up. So as to preserve the original table and results, we'll create a new table and copy the data using an `INSERT INTO SELECT`: @@ -152,7 +151,7 @@ ADD PROJECTION prj_tip_amount ) ``` -It is necessary after adding a projection to use the `MATERIALIZE PROJECTION` +It is necessary after adding a projection to use the `MATERIALIZE PROJECTION` statement so that the data in it is physically ordered and rewritten according to the specified query above: @@ -178,8 +177,8 @@ We can confirm that our query above did indeed use the projection we made by querying the `system.query_log` table: ```sql -SELECT query, projections -FROM system.query_log +SELECT query, projections +FROM system.query_log WHERE query_id='' ``` @@ -196,9 +195,9 @@ WHERE query_id='' ### Using projections to speed up UK price paid queries {#using-projections-to-speed-up-UK-price-paid} To demonstrate how projections can be used to speed up query performance, let's -take a look at an example using a real life dataset. For this example we'll be +take a look at an example using a real life dataset. For this example we'll be using the table from our [UK Property Price Paid](https://clickhouse.com/docs/getting-started/example-datasets/uk-price-paid) -tutorial with 30.03 million rows. This dataset is also available within our +tutorial with 30.03 million rows. This dataset is also available within our [sql.clickhouse.com](https://sql.clickhouse.com/?query_id=6IDMHK3OMR1C97J6M9EUQS) environment. 
@@ -229,7 +228,7 @@ ORDER BY avg(price) DESC LIMIT 3 ``` -Notice that despite being very fast how a full table scan of all 30.03 million rows occurred for both queries, due +Notice that despite being very fast how a full table scan of all 30.03 million rows occurred for both queries, due to the fact that neither `town` nor `price` were in our `ORDER BY` statement when we created the table: @@ -252,9 +251,9 @@ CREATE TABLE uk.uk_price_paid_with_projections AS uk_price_paid; INSERT INTO uk.uk_price_paid_with_projections SELECT * FROM uk.uk_price_paid; ``` -We create and populate projection `prj_oby_town_price` which produces an -additional (hidden) table with a primary index, ordering by town and price, to -optimize the query that lists the counties in a specific town for the highest +We create and populate projection `prj_oby_town_price` which produces an +additional (hidden) table with a primary index, ordering by town and price, to +optimize the query that lists the counties in a specific town for the highest paid prices: ```sql @@ -299,8 +298,8 @@ SETTINGS mutations_sync = 1 :::note If there is a `GROUP BY` clause used in a projection like in the `prj_gby_county` -projection above, then the underlying storage engine for the (hidden) table -becomes `AggregatingMergeTree`, and all aggregate functions are converted to +projection above, then the underlying storage engine for the (hidden) table +becomes `AggregatingMergeTree`, and all aggregate functions are converted to `AggregateFunction`. This ensures proper incremental data aggregation. ::: @@ -309,7 +308,7 @@ and its two projections: Visualization of the main table uk_price_paid_with_projections and its two projections -If we now run the query that lists the counties in London for the three highest +If we now run the query that lists the counties in London for the three highest paid prices again, we see an improvement in query performance: ```sql runnable @@ -322,7 +321,7 @@ ORDER BY price DESC LIMIT 3 ``` -Likewise, for the query that lists the U.K. counties with the three highest +Likewise, for the query that lists the U.K. counties with the three highest average-paid prices: ```sql runnable @@ -336,18 +335,18 @@ LIMIT 3 ``` Note that both queries target the original table, and that both queries resulted -in a full table scan (all 30.03 million rows got streamed from disk) before we +in a full table scan (all 30.03 million rows got streamed from disk) before we created the two projections. Also, note that the query that lists the counties in London for the three highest paid prices is streaming 2.17 million rows. When we directly used a second table optimized for this query, only 81.92 thousand rows were streamed from disk. -The reason for the difference is that currently, the `optimize_read_in_order` +The reason for the difference is that currently, the `optimize_read_in_order` optimization mentioned above isn't supported for projections. -We inspect the `system.query_log` table to see that ClickHouse -automatically used the two projections for the two queries above (see the +We inspect the `system.query_log` table to see that ClickHouse +automatically used the two projections for the two queries above (see the projections column below): ```sql @@ -465,7 +464,6 @@ ORDER BY year ASC ``` The results should be the same, but the performance better on the latter example! - #### Query 2. 
Average price per year in London {#average-price-london-projections} ```sql runnable @@ -480,7 +478,6 @@ ORDER BY year ASC SETTINGS optimize_use_projections=0 ``` - ```sql runnable SELECT toYear(date) AS year, @@ -535,16 +532,16 @@ Again, the result is the same but notice the improvement in query performance fo ### Combining projections in one query {#combining-projections} -Starting in version 25.6, building on the `_part_offset` support introduced in -the previous version, ClickHouse can now use multiple projections to accelerate +Starting in version 25.6, building on the `_part_offset` support introduced in +the previous version, ClickHouse can now use multiple projections to accelerate a single query with multiple filters. -Importantly, ClickHouse still reads data from only one projection (or the base table), +Importantly, ClickHouse still reads data from only one projection (or the base table), but can use other projections' primary indexes to prune unnecessary parts before reading. -This is especially useful for queries that filter on multiple columns, each +This is especially useful for queries that filter on multiple columns, each potentially matching a different projection. -> Currently, this mechanism only prunes entire parts. Granule-level pruning is +> Currently, this mechanism only prunes entire parts. Granule-level pruning is not yet supported. To demonstrate this, we define the table (with projections using `_part_offset` columns) @@ -590,7 +587,7 @@ INSERT INTO page_views VALUES ( ``` :::note -Note: The table uses custom settings for illustration, such as one-row granules +Note: The table uses custom settings for illustration, such as one-row granules and disabled part merges, which are not recommended for production use. ::: @@ -599,25 +596,25 @@ This setup produces: - One primary index entry per row (in the base table and each projection) - Each part contains exactly one row -With this setup, we run a query filtering on both `region` and `user_id`. -Since the base table’s primary index is built from `event_date` and `id`, it -is unhelpful here, ClickHouse therefore uses: + With this setup, we run a query filtering on both `region` and `user_id`. + Since the base table’s primary index is built from `event_date` and `id`, it + is unhelpful here, ClickHouse therefore uses: - `region_proj` to prune parts by region - `user_id_proj` to further prune by `user_id` -This behavior is visible using `EXPLAIN projections = 1`, which shows how -ClickHouse selects and applies projections. + This behavior is visible using `EXPLAIN projections = 1`, which shows how + ClickHouse selects and applies projections. -```sql -EXPLAIN projections=1 -SELECT * FROM page_views WHERE region = 'us_west' AND user_id = 107; -``` + ```sql + EXPLAIN projections=1 + SELECT * FROM page_views WHERE region = 'us_west' AND user_id = 107; + ``` ```response ┌─explain────────────────────────────────────────────────────────────────────────────────┐ 1. │ Expression ((Project names + Projection)) │ - 2. │ Expression │ + 2. │ Expression │ 3. │ ReadFromMergeTree (default.page_views) │ 4. │ Projections: │ 5. │ Name: region_proj │ @@ -650,7 +647,7 @@ The `EXPLAIN` output (shown above) reveals the logical query plan, top to bottom | 14-22 | Uses user`_id_proj` to identify 1 part where `user_id = 107`, further pruning 2 of the 3 remaining parts | In the end, just **1 out of 5 parts** is read from the base table. 
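To confirm the pruning from the client side, you can also look at the part and mark counters recorded for the query. This is a sketch, assuming the query log has already been flushed and that the `ProfileEvents` counter names used below (`SelectedParts`, `SelectedMarks`) are available in your version:

```sql
-- Sketch: how many parts and marks were selected for the last page_views query
SELECT
    query,
    ProfileEvents['SelectedParts'] AS selected_parts,
    ProfileEvents['SelectedMarks'] AS selected_marks
FROM system.query_log
WHERE type = 'QueryFinish'
  AND query ILIKE '%page_views%'
  AND query NOT ILIKE '%system.query_log%' -- exclude this lookup itself
ORDER BY event_time DESC
LIMIT 1
```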
-By combining the index analysis of multiple projections, ClickHouse significantly reduces the amount of data scanned, +By combining the index analysis of multiple projections, ClickHouse significantly reduces the amount of data scanned, improving performance while keeping storage overhead low. ## Related content {#related-content} diff --git a/docs/data-modeling/projections/2_materialized-views-versus-projections.md b/docs/data-modeling/projections/2_materialized-views-versus-projections.md index 9819dbf5278..a98564d12b6 100644 --- a/docs/data-modeling/projections/2_materialized-views-versus-projections.md +++ b/docs/data-modeling/projections/2_materialized-views-versus-projections.md @@ -6,8 +6,8 @@ hide_title: false description: 'Article comparing materialized views and projections in ClickHouse, including their use cases, performance, and limitations.' --- -> A common question from users is when they should use materialized views versus -projections. In this article we will explore the key differences between the two and why you +> A common question from users is when they should use materialized views versus +projections. In this article we will explore the key differences between the two and why you may want to pick one over the other in certain scenarios. ## Summary of key differences {#key-differences} @@ -69,20 +69,20 @@ You should consider avoiding the use of projections when: ## Summary {#summary} -Materialized views and projections are both powerful tools in your toolkit for +Materialized views and projections are both powerful tools in your toolkit for optimizing queries and transforming data, and in general, we recommend not to view -using them as an either/or choice. Instead, they can be used in a complementary +using them as an either/or choice. Instead, they can be used in a complementary manner to get the most out of your queries. As such, the choice between materialized views and projections in ClickHouse really depends on your specific use case and access patterns. As a general rule of thumb, you should consider using materialized views when you need to aggregate data from one or more source tables into a target table or -perform complex transformations at scale. Materialized views are excellent for shifting -the work of expensive aggregations from query time to insert time. They are a -great choice for daily or monthly rollups, real-time dashboards or data summaries. +perform complex transformations at scale. Materialized views are excellent for shifting +the work of expensive aggregations from query time to insert time. They are a +great choice for daily or monthly rollups, real-time dashboards or data summaries. -On the other hand, you should use projections when you need to optimize queries +On the other hand, you should use projections when you need to optimize queries which filter on different columns than those which are used in the table's primary key which determines the physical ordering of the data on disk. They are particularly useful when it's no longer possible to change the primary key of a table, or when diff --git a/docs/data-modeling/schema-design.md b/docs/data-modeling/schema-design.md index 51e8dbee58f..f712804d79b 100644 --- a/docs/data-modeling/schema-design.md +++ b/docs/data-modeling/schema-design.md @@ -25,7 +25,6 @@ The Stack Overflow dataset contains a number of related tables. In any data mode The above schema is intentionally not optimal for the purposes of this guide. 
- ## Establish initial schema {#establish-initial-schema} Since the `posts` table will be the target for most analytics queries, we focus on establishing a schema for this table. This data is available in the public S3 bucket `s3://datasets-documentation/stackoverflow/parquet/posts/*.parquet` with a file per year. @@ -140,87 +139,84 @@ The largest initial improvement in compression and query performance can be obta - **Use strict types** - Our initial schema used Strings for many columns which are clearly numerics. Usage of the correct types will ensure the expected semantics when filtering and aggregating. The same applies to date types, which have been correctly provided in the Parquet files. - **Avoid nullable Columns** - By default the above columns have been assumed to be Null. The Nullable type allows queries to determine the difference between an empty and Null value. This creates a separate column of UInt8 type. This additional column has to be processed every time a user works with a nullable column. This leads to additional storage space used and almost always negatively affects query performance. Only use Nullable if there is a difference between the default empty value for a type and Null. For example, a value of 0 for empty values in the `ViewCount` column will likely be sufficient for most queries and not impact results. If empty values should be treated differently, they can often also be excluded from queries with a filter. -Use the minimal precision for numeric types - ClickHouse has a number of numeric types designed for different numeric ranges and precision. Always aim to minimize the number of bits used to represent a column. As well as integers of different size e.g. Int16, ClickHouse offers unsigned variants whose minimum value is 0. These can allow fewer bits to be used for a column e.g. UInt16 has a maximum value of 65535, twice that of an Int16. Prefer these types over larger signed variants if possible. + Use the minimal precision for numeric types - ClickHouse has a number of numeric types designed for different numeric ranges and precision. Always aim to minimize the number of bits used to represent a column. As well as integers of different size e.g. Int16, ClickHouse offers unsigned variants whose minimum value is 0. These can allow fewer bits to be used for a column e.g. UInt16 has a maximum value of 65535, twice that of an Int16. Prefer these types over larger signed variants if possible. - **Minimal precision for date types** - ClickHouse supports a number of date and datetime types. Date and Date32 can be used for storing pure dates, with the latter supporting a larger date range at the expense of more bits. DateTime and DateTime64 provide support for date times. DateTime is limited to second granularity and uses 32 bits. DateTime64, as the name suggests, uses 64 bits but provides support up to nanosecond granularity. As ever, choose the more coarse version acceptable for queries, minimizing the number of bits needed. - **Use LowCardinality** - Numbers, strings, Date or DateTime columns with a low number of unique values can potentially be encoded using the LowCardinality type. This dictionary encodes values, reducing the size on disk. Consider this for columns with less than 10k unique values. -FixedString for special cases - Strings which have a fixed length can be encoded with the FixedString type e.g. language and currency codes. This is efficient when data has the length of precisely N bytes. 
In all other cases, it is likely to reduce efficiency and LowCardinality is preferred. + FixedString for special cases - Strings which have a fixed length can be encoded with the FixedString type e.g. language and currency codes. This is efficient when data has the length of precisely N bytes. In all other cases, it is likely to reduce efficiency and LowCardinality is preferred. - **Enums for data validation** - The Enum type can be used to efficiently encode enumerated types. Enums can either be 8 or 16 bits, depending on the number of unique values they are required to store. Consider using this if you need either the associated validation at insert time (undeclared values will be rejected) or wish to perform queries which exploit a natural ordering in the Enum values e.g. imagine a feedback column containing user responses `Enum(':(' = 1, ':|' = 2, ':)' = 3)`. -> Tip: To find the range of all columns, and the number of distinct values, users can use the simple query `SELECT * APPLY min, * APPLY max, * APPLY uniq FROM table FORMAT Vertical`. We recommend performing this over a smaller subset of the data as this can be expensive. This query requires numerics to be at least defined as such for an accurate result i.e. not a String. - -By applying these simple rules to our posts table, we can identify an optimal type for each column: - - -| Column | Is Numeric | Min, Max | Unique Values | Nulls | Comment | Optimized Type | -|------------------------|------------|------------------------------------------------------------------------|----------------|--------|----------------------------------------------------------------------------------------------|------------------------------------------| -| `PostTypeId` | Yes | 1, 8 | 8 | No | | `Enum('Question' = 1, 'Answer' = 2, 'Wiki' = 3, 'TagWikiExcerpt' = 4, 'TagWiki' = 5, 'ModeratorNomination' = 6, 'WikiPlaceholder' = 7, 'PrivilegeWiki' = 8)` | -| `AcceptedAnswerId` | Yes | 0, 78285170 | 12282094 | Yes | Differentiate Null with 0 value | UInt32 | -| `CreationDate` | No | 2008-07-31 21:42:52.667000000, 2024-03-31 23:59:17.697000000 | - | No | Millisecond granularity is not required, use DateTime | DateTime | -| `Score` | Yes | -217, 34970 | 3236 | No | | Int32 | -| `ViewCount` | Yes | 2, 13962748 | 170867 | No | | UInt32 | -| `Body` | No | - | - | No | | String | -| `OwnerUserId` | Yes | -1, 4056915 | 6256237 | Yes | | Int32 | -| `OwnerDisplayName` | No | - | 181251 | Yes | Consider Null to be empty string | String | -| `LastEditorUserId` | Yes | -1, 9999993 | 1104694 | Yes | 0 is an unused value can be used for Nulls | Int32 | -| `LastEditorDisplayName` | No | - | 70952 | Yes | Consider Null to be an empty string. 
Tested LowCardinality and no benefit | String | -| `LastEditDate` | No | 2008-08-01 13:24:35.051000000, 2024-04-06 21:01:22.697000000 | - | No | Millisecond granularity is not required, use DateTime | DateTime | -| `LastActivityDate` | No | 2008-08-01 12:19:17.417000000, 2024-04-06 21:01:22.697000000 | - | No | Millisecond granularity is not required, use DateTime | DateTime | -| `Title` | No | - | - | No | Consider Null to be an empty string | String | -| `Tags` | No | - | - | No | Consider Null to be an empty string | String | -| `AnswerCount` | Yes | 0, 518 | 216 | No | Consider Null and 0 to same | UInt16 | -| `CommentCount` | Yes | 0, 135 | 100 | No | Consider Null and 0 to same | UInt8 | -| `FavoriteCount` | Yes | 0, 225 | 6 | Yes | Consider Null and 0 to same | UInt8 | -| `ContentLicense` | No | - | 3 | No | LowCardinality outperforms FixedString | LowCardinality(String) | -| `ParentId` | No | - | 20696028 | Yes | Consider Null to be an empty string | String | -| `CommunityOwnedDate` | No | 2008-08-12 04:59:35.017000000, 2024-04-01 05:36:41.380000000 | - | Yes | Consider default 1970-01-01 for Nulls. Millisecond granularity is not required, use DateTime | DateTime | -| `ClosedDate` | No | 2008-09-04 20:56:44, 2024-04-06 18:49:25.393000000 | - | Yes | Consider default 1970-01-01 for Nulls. Millisecond granularity is not required, use DateTime | DateTime | - -
- -The above gives us the following schema: - -```sql -CREATE TABLE posts_v2 -( - `Id` Int32, - `PostTypeId` Enum('Question' = 1, 'Answer' = 2, 'Wiki' = 3, 'TagWikiExcerpt' = 4, 'TagWiki' = 5, 'ModeratorNomination' = 6, 'WikiPlaceholder' = 7, 'PrivilegeWiki' = 8), - `AcceptedAnswerId` UInt32, - `CreationDate` DateTime, - `Score` Int32, - `ViewCount` UInt32, - `Body` String, - `OwnerUserId` Int32, - `OwnerDisplayName` String, - `LastEditorUserId` Int32, - `LastEditorDisplayName` String, - `LastEditDate` DateTime, - `LastActivityDate` DateTime, - `Title` String, - `Tags` String, - `AnswerCount` UInt16, - `CommentCount` UInt8, - `FavoriteCount` UInt8, - `ContentLicense`LowCardinality(String), - `ParentId` String, - `CommunityOwnedDate` DateTime, - `ClosedDate` DateTime -) -ENGINE = MergeTree -ORDER BY tuple() -COMMENT 'Optimized types' -``` - -We can populate this with a simple `INSERT INTO SELECT`, reading the data from our previous table and inserting into this one: - -```sql -INSERT INTO posts_v2 SELECT * FROM posts - -0 rows in set. Elapsed: 146.471 sec. Processed 59.82 million rows, 83.82 GB (408.40 thousand rows/s., 572.25 MB/s.) -``` - -We don't retain any nulls in our new schema. The above insert converts these implicitly to default values for their respective types - 0 for integers and an empty value for strings. ClickHouse also automatically converts any numerics to their target precision. -Primary (Ordering) Keys in ClickHouse -Users coming from OLTP databases often look for the equivalent concept in ClickHouse. + > Tip: To find the range of all columns, and the number of distinct values, users can use the simple query `SELECT * APPLY min, * APPLY max, * APPLY uniq FROM table FORMAT Vertical`. We recommend performing this over a smaller subset of the data as this can be expensive. This query requires numerics to be at least defined as such for an accurate result i.e. not a String. + + By applying these simple rules to our posts table, we can identify an optimal type for each column: + + | Column | Is Numeric | Min, Max | Unique Values | Nulls | Comment | Optimized Type | + |------------------------|------------|------------------------------------------------------------------------|----------------|--------|----------------------------------------------------------------------------------------------|------------------------------------------| + | `PostTypeId` | Yes | 1, 8 | 8 | No | | `Enum('Question' = 1, 'Answer' = 2, 'Wiki' = 3, 'TagWikiExcerpt' = 4, 'TagWiki' = 5, 'ModeratorNomination' = 6, 'WikiPlaceholder' = 7, 'PrivilegeWiki' = 8)` | + | `AcceptedAnswerId` | Yes | 0, 78285170 | 12282094 | Yes | Differentiate Null with 0 value | UInt32 | + | `CreationDate` | No | 2008-07-31 21:42:52.667000000, 2024-03-31 23:59:17.697000000 | - | No | Millisecond granularity is not required, use DateTime | DateTime | + | `Score` | Yes | -217, 34970 | 3236 | No | | Int32 | + | `ViewCount` | Yes | 2, 13962748 | 170867 | No | | UInt32 | + | `Body` | No | - | - | No | | String | + | `OwnerUserId` | Yes | -1, 4056915 | 6256237 | Yes | | Int32 | + | `OwnerDisplayName` | No | - | 181251 | Yes | Consider Null to be empty string | String | + | `LastEditorUserId` | Yes | -1, 9999993 | 1104694 | Yes | 0 is an unused value can be used for Nulls | Int32 | + | `LastEditorDisplayName` | No | - | 70952 | Yes | Consider Null to be an empty string. 
Tested LowCardinality and no benefit | String | + | `LastEditDate` | No | 2008-08-01 13:24:35.051000000, 2024-04-06 21:01:22.697000000 | - | No | Millisecond granularity is not required, use DateTime | DateTime | + | `LastActivityDate` | No | 2008-08-01 12:19:17.417000000, 2024-04-06 21:01:22.697000000 | - | No | Millisecond granularity is not required, use DateTime | DateTime | + | `Title` | No | - | - | No | Consider Null to be an empty string | String | + | `Tags` | No | - | - | No | Consider Null to be an empty string | String | + | `AnswerCount` | Yes | 0, 518 | 216 | No | Consider Null and 0 to same | UInt16 | + | `CommentCount` | Yes | 0, 135 | 100 | No | Consider Null and 0 to same | UInt8 | + | `FavoriteCount` | Yes | 0, 225 | 6 | Yes | Consider Null and 0 to same | UInt8 | + | `ContentLicense` | No | - | 3 | No | LowCardinality outperforms FixedString | LowCardinality(String) | + | `ParentId` | No | - | 20696028 | Yes | Consider Null to be an empty string | String | + | `CommunityOwnedDate` | No | 2008-08-12 04:59:35.017000000, 2024-04-01 05:36:41.380000000 | - | Yes | Consider default 1970-01-01 for Nulls. Millisecond granularity is not required, use DateTime | DateTime | + | `ClosedDate` | No | 2008-09-04 20:56:44, 2024-04-06 18:49:25.393000000 | - | Yes | Consider default 1970-01-01 for Nulls. Millisecond granularity is not required, use DateTime | DateTime | + + The above gives us the following schema: + + ```sql + CREATE TABLE posts_v2 + ( + `Id` Int32, + `PostTypeId` Enum('Question' = 1, 'Answer' = 2, 'Wiki' = 3, 'TagWikiExcerpt' = 4, 'TagWiki' = 5, 'ModeratorNomination' = 6, 'WikiPlaceholder' = 7, 'PrivilegeWiki' = 8), + `AcceptedAnswerId` UInt32, + `CreationDate` DateTime, + `Score` Int32, + `ViewCount` UInt32, + `Body` String, + `OwnerUserId` Int32, + `OwnerDisplayName` String, + `LastEditorUserId` Int32, + `LastEditorDisplayName` String, + `LastEditDate` DateTime, + `LastActivityDate` DateTime, + `Title` String, + `Tags` String, + `AnswerCount` UInt16, + `CommentCount` UInt8, + `FavoriteCount` UInt8, + `ContentLicense`LowCardinality(String), + `ParentId` String, + `CommunityOwnedDate` DateTime, + `ClosedDate` DateTime + ) + ENGINE = MergeTree + ORDER BY tuple() + COMMENT 'Optimized types' + ``` + + We can populate this with a simple `INSERT INTO SELECT`, reading the data from our previous table and inserting into this one: + + ```sql + INSERT INTO posts_v2 SELECT * FROM posts + + 0 rows in set. Elapsed: 146.471 sec. Processed 59.82 million rows, 83.82 GB (408.40 thousand rows/s., 572.25 MB/s.) + ``` + + We don't retain any nulls in our new schema. The above insert converts these implicitly to default values for their respective types - 0 for integers and an empty value for strings. ClickHouse also automatically converts any numerics to their target precision. + Primary (Ordering) Keys in ClickHouse + Users coming from OLTP databases often look for the equivalent concept in ClickHouse. ## Choosing an ordering key {#choosing-an-ordering-key} @@ -235,11 +231,11 @@ The selected key in ClickHouse will determine not only the index, but also order Some simple rules can be applied to help choose an ordering key. The following can sometimes be in conflict, so consider these in order. Users can identify a number of keys from this process, with 4-5 typically sufficient: - Select columns which align with your common filters. If a column is used frequently in `WHERE` clauses, prioritize including these in your key over those which are used less frequently. 
-Prefer columns which help exclude a large percentage of the total rows when filtered, thus reducing the amount of data which needs to be read. + Prefer columns which help exclude a large percentage of the total rows when filtered, thus reducing the amount of data which needs to be read. - Prefer columns which are likely to be highly correlated with other columns in the table. This will help ensure these values are also stored contiguously, improving compression. -`GROUP BY` and `ORDER BY` operations for columns in the ordering key can be made more memory efficient. + `GROUP BY` and `ORDER BY` operations for columns in the ordering key can be made more memory efficient. -When identifying the subset of columns for the ordering key, declare the columns in a specific order. This order can significantly influence both the efficiency of the filtering on secondary key columns in queries, and the compression ratio for the table's data files. In general, it is best to order the keys in ascending order of cardinality. This should be balanced against the fact that filtering on columns that appear later in the ordering key will be less efficient than filtering on those that appear earlier in the tuple. Balance these behaviors and consider your access patterns (and most importantly test variants). + When identifying the subset of columns for the ordering key, declare the columns in a specific order. This order can significantly influence both the efficiency of the filtering on secondary key columns in queries, and the compression ratio for the table's data files. In general, it is best to order the keys in ascending order of cardinality. This should be balanced against the fact that filtering on columns that appear later in the ordering key will be less efficient than filtering on those that appear earlier in the tuple. Balance these behaviors and consider your access patterns (and most importantly test variants). ### Example {#example} @@ -312,7 +308,6 @@ INSERT INTO posts_v3 SELECT * FROM posts_v2 0 rows in set. Elapsed: 158.074 sec. Processed 59.82 million rows, 76.21 GB (378.42 thousand rows/s., 482.14 MB/s.) Peak memory usage: 6.41 GiB. - Our previous query improves the query response time by over 3x: SELECT @@ -348,4 +343,4 @@ In order to minimize the use of Joins at query time, users have several tools/ap - [**Incremental Materialized Views**](/materialized-view/incremental-materialized-view) - A ClickHouse feature for shifting the cost of a computation from query time to insert time, including the ability to incrementally compute aggregate values. - [**Refreshable Materialized Views**](/materialized-view/refreshable-materialized-view) - Similar to materialized views used in other database products, this allows the results of a query to be periodically computed and the result cached. -We explore each of these approaches in each guide, highlighting when each is appropriate with an example showing how it can be applied to solving questions for the Stack Overflow dataset. + We explore each of these approaches in each guide, highlighting when each is appropriate with an example showing how it can be applied to solving questions for the Stack Overflow dataset. 
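Circling back to the ordering key rules above, the sketch below shows the general shape of such a declaration for the Stack Overflow posts data. The column subset and the key `(PostTypeId, toDate(CreationDate), CommentCount)` are illustrative assumptions rather than the exact definition used in this guide:

```sql
-- Sketch: a low-cardinality filter column first, then a date expression,
-- then a numeric column that is also commonly filtered
CREATE TABLE posts_ordered
(
    `Id` Int32,
    `PostTypeId` Enum8('Question' = 1, 'Answer' = 2),
    `CreationDate` DateTime,
    `CommentCount` UInt8,
    `Body` String
)
ENGINE = MergeTree
ORDER BY (PostTypeId, toDate(CreationDate), CommentCount)
```

Ordering by the lowest-cardinality column first keeps similar values adjacent on disk, which typically improves the compression of the columns that follow.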
diff --git a/docs/deployment-guides/horizontal-scaling.md b/docs/deployment-guides/horizontal-scaling.md index 1aac487a345..71efae3a5a0 100644 --- a/docs/deployment-guides/horizontal-scaling.md +++ b/docs/deployment-guides/horizontal-scaling.md @@ -136,9 +136,9 @@ Starting from the top: - The cluster `cluster_2S_1R` has two shards, and each of those shards has one replica. Take a look at the architecture diagram toward the beginning of this document, and compare it with the two `shard` definitions in the XML below. In each of the shard definitions there is one replica. The replica is for that specific shard. The host and port for that replica is specified. The replica for the first shard in the configuration is stored on `chnode1`, and the replica for the second shard in the configuration is stored on `chnode2`. - Internal replication for the shards is set to true. Each shard can have the `internal_replication` parameter defined in the config file. If this parameter is set to true, the write operation selects the first healthy replica and writes data to it. -```xml title="remote-servers.xml on chnode1" - - + ```xml title="remote-servers.xml on chnode1" + + mysecretphrase @@ -156,9 +156,9 @@ Starting from the top: - - -``` + + + ``` ### Configuring the use of Keeper {#configuring-the-use-of-keeper} @@ -376,110 +376,108 @@ As `chnode3` is not storing data and is only used for ClickHouse Keeper to provi 1. Connect to `chnode1` and verify that the cluster `cluster_2S_1R` configured above exists -```sql title="Query" -SHOW CLUSTERS -``` + ```sql title="Query" + SHOW CLUSTERS + ``` -```response title="Response" -┌─cluster───────┐ -│ cluster_2S_1R │ -└───────────────┘ -``` + ```response title="Response" + ┌─cluster───────┐ + │ cluster_2S_1R │ + └───────────────┘ + ``` 2. Create a database on the cluster -```sql title="Query" -CREATE DATABASE db1 ON CLUSTER cluster_2S_1R -``` + ```sql title="Query" + CREATE DATABASE db1 ON CLUSTER cluster_2S_1R + ``` -```response title="Response" -┌─host────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ -│ chnode2 │ 9000 │ 0 │ │ 1 │ 0 │ -│ chnode1 │ 9000 │ 0 │ │ 0 │ 0 │ -└─────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ -``` + ```response title="Response" + ┌─host────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ + │ chnode2 │ 9000 │ 0 │ │ 1 │ 0 │ + │ chnode1 │ 9000 │ 0 │ │ 0 │ 0 │ + └─────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ + ``` 3. Create a table with MergeTree table engine on the cluster. 
-:::note -We do not need not to specify parameters on the table engine since these will be automatically defined based on our macros -::: + :::note + We do not need not to specify parameters on the table engine since these will be automatically defined based on our macros + ::: -```sql title="Query" -CREATE TABLE db1.table1 ON CLUSTER cluster_2S_1R -( + ```sql title="Query" + CREATE TABLE db1.table1 ON CLUSTER cluster_2S_1R + ( `id` UInt64, `column1` String -) -ENGINE = MergeTree -ORDER BY id -``` -```response title="Response" -┌─host────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ -│ chnode1 │ 9000 │ 0 │ │ 1 │ 0 │ -│ chnode2 │ 9000 │ 0 │ │ 0 │ 0 │ -└─────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ -``` + ) + ENGINE = MergeTree + ORDER BY id + ``` + ```response title="Response" + ┌─host────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ + │ chnode1 │ 9000 │ 0 │ │ 1 │ 0 │ + │ chnode2 │ 9000 │ 0 │ │ 0 │ 0 │ + └─────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ + ``` 4. Connect to `chnode1` and insert a row -```sql title="Query" -INSERT INTO db1.table1 (id, column1) VALUES (1, 'abc'); -``` + ```sql title="Query" + INSERT INTO db1.table1 (id, column1) VALUES (1, 'abc'); + ``` 5. Connect to `chnode2` and insert a row -```sql title="Query" -INSERT INTO db1.table1 (id, column1) VALUES (2, 'def'); -``` + ```sql title="Query" + INSERT INTO db1.table1 (id, column1) VALUES (2, 'def'); + ``` 6. Connect to either node, `chnode1` or `chnode2` and you will see only the row that was inserted into that table on that node. -for example, on `chnode2` + for example, on `chnode2` -```sql title="Query" -SELECT * FROM db1.table1; -``` - -```response title="Response" -┌─id─┬─column1─┐ -│ 2 │ def │ -└────┴─────────┘ -``` + ```sql title="Query" + SELECT * FROM db1.table1; + ``` + ```response title="Response" + ┌─id─┬─column1─┐ + │ 2 │ def │ + └────┴─────────┘ + ``` 7. Create a distributed table to query both shards on both nodes. -(In this example, the `rand()` function is set as the sharding key so that it randomly distributes each insert) + (In this example, the `rand()` function is set as the sharding key so that it randomly distributes each insert) -```sql title="Query" -CREATE TABLE db1.table1_dist ON CLUSTER cluster_2S_1R -( + ```sql title="Query" + CREATE TABLE db1.table1_dist ON CLUSTER cluster_2S_1R + ( `id` UInt64, `column1` String -) -ENGINE = Distributed('cluster_2S_1R', 'db1', 'table1', rand()) -``` + ) + ENGINE = Distributed('cluster_2S_1R', 'db1', 'table1', rand()) + ``` -```response title="Response" -┌─host────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ -│ chnode2 │ 9000 │ 0 │ │ 1 │ 0 │ -│ chnode1 │ 9000 │ 0 │ │ 0 │ 0 │ -└─────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ -``` + ```response title="Response" + ┌─host────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ + │ chnode2 │ 9000 │ 0 │ │ 1 │ 0 │ + │ chnode1 │ 9000 │ 0 │ │ 0 │ 0 │ + └─────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ + ``` 8. Connect to either `chnode1` or `chnode2` and query the distributed table to see both rows. 
-```sql title="Query" -SELECT * FROM db1.table1_dist; -``` - -```reponse title="Response" -┌─id─┬─column1─┐ -│ 2 │ def │ -└────┴─────────┘ -┌─id─┬─column1─┐ -│ 1 │ abc │ -└────┴─────────┘ -``` - + ```sql title="Query" + SELECT * FROM db1.table1_dist; + ``` + + ```reponse title="Response" + ┌─id─┬─column1─┐ + │ 2 │ def │ + └────┴─────────┘ + ┌─id─┬─column1─┐ + │ 1 │ abc │ + └────┴─────────┘ + ``` ## More information about {#more-information-about} diff --git a/docs/deployment-guides/parallel-replicas.mdx b/docs/deployment-guides/parallel-replicas.mdx index 4fff9cf815d..0d9b61b6449 100644 --- a/docs/deployment-guides/parallel-replicas.mdx +++ b/docs/deployment-guides/parallel-replicas.mdx @@ -21,11 +21,11 @@ import image_9 from '@site/static/images/deployment-guides/parallel-replicas-9.p ## Introduction {#introduction} -ClickHouse processes queries extremely quickly, but how are these queries -distributed and parallelized across multiple servers? +ClickHouse processes queries extremely quickly, but how are these queries +distributed and parallelized across multiple servers? > In this guide, we will first discuss how ClickHouse distributes a query across -multiple shards via distributed tables, and then how a query can leverage +multiple shards via distributed tables, and then how a query can leverage multiple replicas for its execution. ## Sharded architecture {#sharded-architecture} @@ -46,8 +46,8 @@ The figure above visualizes what happens when a client queries a distributed tab
  1. - The select query is sent to a distributed table on a node arbitrarily - (via a round-robin strategy or after being routed to a specific server + The `SELECT` query is sent to the distributed table on an arbitrarily chosen node + (via a round-robin strategy or after being routed to a specific server by a load balancer). This node is now going to act as a coordinator.
  2. @@ -119,7 +119,7 @@ With parallel replicas: granules to process.
  3. - The coordinator splits the workload into a set of granules that can be + The coordinator splits the workload into a set of granules that can be assigned to different replicas.
  4. @@ -148,8 +148,8 @@ from working perfectly: Tail latency between replicas needs to be handled somehow.
  5. - The filesystem cache varies from replica to replica based on the - activity on each replica, meaning that a random task assignment might + The filesystem cache varies from replica to replica based on the + activity on each replica, meaning that a random task assignment might lead to less optimal performance given the cache locality.
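In practice, parallel replicas are enabled on a per-query basis through settings. The sketch below uses the `allow_experimental_parallel_reading_from_replicas` setting shown later in this guide; the other setting names and values are illustrative assumptions for this example:

```sql
-- Sketch: run a query with parallel replicas enabled (values are illustrative)
SELECT count()
FROM session_events
SETTINGS
    allow_experimental_parallel_reading_from_replicas = 2,
    max_parallel_replicas = 100,
    cluster_for_parallel_replicas = 'default'
```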
@@ -328,7 +328,7 @@ the label `comp-parallel-replicas`. ## Investigating issues with parallel replicas {#investigating-issues-with-parallel-replicas} -You can check what settings are being used for each query in the +You can check what settings are being used for each query in the [`system.query_log`](/docs/operations/system-tables/query_log) table. You can also look at the [`system.events`](/docs/operations/system-tables/events) table to see all the events that have occured on the server, and you can use the @@ -339,9 +339,9 @@ table to see all the events that have occured on the server, and you can use the SELECT hostname(), * -FROM clusterAllReplicas('default', system.events) -WHERE event ILIKE '%ParallelReplicas%' -``` + FROM clusterAllReplicas('default', system.events) + WHERE event ILIKE '%ParallelReplicas%' + ```
Response ```response title="Response" @@ -392,7 +392,7 @@ WHERE event ILIKE '%ParallelReplicas%' ```
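For a cluster-wide view instead of per-host rows, the same counters can be aggregated. A small variation of the query above:

```sql
-- Sketch: sum the parallel-replica counters across all replicas
SELECT event, sum(value) AS total
FROM clusterAllReplicas('default', system.events)
WHERE event ILIKE '%ParallelReplicas%'
GROUP BY event
ORDER BY total DESC
```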
-The [`system.text_log`](/docs/operations/system-tables/text_log) table also +The [`system.text_log`](/docs/operations/system-tables/text_log) table also contains information about the execution of queries using parallel replicas: ```sql title="Query" @@ -419,34 +419,34 @@ ORDER BY event_time_microseconds ASC │ The number of replicas requested (100) is bigger than the real number available in the cluster (6). Will use the latter number to execute the query. │ │ Initial request from replica 4: 2 parts: [part all_0_2_1 with ranges [(0, 182)], part all_3_3_0 with ranges [(0, 62)]]---------- Received from 4 replica - │ +│ │ Reading state is fully initialized: part all_0_2_1 with ranges [(0, 182)] in replicas [4]; part all_3_3_0 with ranges [(0, 62)] in replicas [4] │ │ Sent initial requests: 1 Replicas count: 6 │ │ Initial request from replica 2: 2 parts: [part all_0_2_1 with ranges [(0, 182)], part all_3_3_0 with ranges [(0, 62)]]---------- Received from 2 replica - │ +│ │ Sent initial requests: 2 Replicas count: 6 │ │ Handling request from replica 4, minimal marks size is 240 │ │ Going to respond to replica 4 with 1 parts: [part all_0_2_1 with ranges [(128, 182)]]. Finish: false; mine_marks=0, stolen_by_hash=54, stolen_rest=0 │ │ Initial request from replica 1: 2 parts: [part all_0_2_1 with ranges [(0, 182)], part all_3_3_0 with ranges [(0, 62)]]---------- Received from 1 replica - │ +│ │ Sent initial requests: 3 Replicas count: 6 │ │ Handling request from replica 4, minimal marks size is 240 │ │ Going to respond to replica 4 with 2 parts: [part all_0_2_1 with ranges [(0, 128)], part all_3_3_0 with ranges [(0, 62)]]. Finish: false; mine_marks=0, stolen_by_hash=0, stolen_rest=190 │ │ Initial request from replica 0: 2 parts: [part all_0_2_1 with ranges [(0, 182)], part all_3_3_0 with ranges [(0, 62)]]---------- Received from 0 replica - │ +│ │ Sent initial requests: 4 Replicas count: 6 │ │ Initial request from replica 5: 2 parts: [part all_0_2_1 with ranges [(0, 182)], part all_3_3_0 with ranges [(0, 62)]]---------- Received from 5 replica - │ +│ │ Sent initial requests: 5 Replicas count: 6 │ │ Handling request from replica 2, minimal marks size is 240 │ │ Going to respond to replica 2 with 0 parts: []. Finish: true; mine_marks=0, stolen_by_hash=0, stolen_rest=0 │ │ Initial request from replica 3: 2 parts: [part all_0_2_1 with ranges [(0, 182)], part all_3_3_0 with ranges [(0, 62)]]---------- Received from 3 replica - │ +│ │ Sent initial requests: 6 Replicas count: 6 │ │ Total rows to read: 2000000 │ │ Handling request from replica 5, minimal marks size is 240 │ @@ -466,27 +466,27 @@ Received from 3 replica ```
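If you do not know the `query_id` up front, one approach is to look it up in `system.query_log` first and then pull the matching messages. The following is a sketch, assuming the standard columns of these two system tables:

```sql
-- Sketch: fetch the text_log messages for the most recent finished query
-- (in practice, add a filter on the query text to target the right query)
WITH (
    SELECT query_id
    FROM system.query_log
    WHERE type = 'QueryFinish'
    ORDER BY event_time DESC
    LIMIT 1
) AS last_query_id
SELECT event_time_microseconds, message
FROM system.text_log
WHERE query_id = last_query_id
ORDER BY event_time_microseconds ASC
```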
-Finally, you can also use the `EXPLAIN PIPELINE`. It highlights how ClickHouse -is going to execute a query and what resources are going to be used for the +Finally, you can also use the `EXPLAIN PIPELINE`. It highlights how ClickHouse +is going to execute a query and what resources are going to be used for the execution of the query. Let's take the following query for example: ```sql -SELECT count(), uniq(pageId) , min(timestamp), max(timestamp) -FROM session_events -WHERE type='type3' +SELECT count(), uniq(pageId) , min(timestamp), max(timestamp) +FROM session_events +WHERE type='type3' GROUP BY toYear(timestamp) LIMIT 10 ``` Let's have a look at the query pipeline without parallel replica: ```sql title="EXPLAIN PIPELINE (without parallel replica)" -EXPLAIN PIPELINE graph = 1, compact = 0 -SELECT count(), uniq(pageId) , min(timestamp), max(timestamp) -FROM session_events -WHERE type='type3' -GROUP BY toYear(timestamp) -LIMIT 10 -SETTINGS allow_experimental_parallel_reading_from_replicas=0 +EXPLAIN PIPELINE graph = 1, compact = 0 +SELECT count(), uniq(pageId) , min(timestamp), max(timestamp) +FROM session_events +WHERE type='type3' +GROUP BY toYear(timestamp) +LIMIT 10 +SETTINGS allow_experimental_parallel_reading_from_replicas=0 FORMAT TSV; ``` @@ -495,13 +495,13 @@ FORMAT TSV; And now with parallel replica: ```sql title="EXPLAIN PIPELINE (with parallel replica)" -EXPLAIN PIPELINE graph = 1, compact = 0 -SELECT count(), uniq(pageId) , min(timestamp), max(timestamp) -FROM session_events -WHERE type='type3' -GROUP BY toYear(timestamp) -LIMIT 10 -SETTINGS allow_experimental_parallel_reading_from_replicas=2 +EXPLAIN PIPELINE graph = 1, compact = 0 +SELECT count(), uniq(pageId) , min(timestamp), max(timestamp) +FROM session_events +WHERE type='type3' +GROUP BY toYear(timestamp) +LIMIT 10 +SETTINGS allow_experimental_parallel_reading_from_replicas=2 FORMAT TSV; ``` diff --git a/docs/deployment-guides/replicated.md b/docs/deployment-guides/replicated.md index 3821ed80118..a820dc1085c 100644 --- a/docs/deployment-guides/replicated.md +++ b/docs/deployment-guides/replicated.md @@ -57,8 +57,8 @@ These values can be customized as you wish. This example configuration gives yo - the name displayed when you connect with `clickhouse-client` is `cluster_1S_2R node 1` - ClickHouse will listen on the IPV4 network on ports 8123 and 9000. -```xml title="/etc/clickhouse-server/config.d/network-and-logging.xml on clickhouse-01" - + ```xml title="/etc/clickhouse-server/config.d/network-and-logging.xml on clickhouse-01" + debug /var/log/clickhouse-server/clickhouse-server.log @@ -70,8 +70,8 @@ These values can be customized as you wish. This example configuration gives yo 0.0.0.0 8123 9000 - -``` + + ``` ### Macros configuration {#macros-configuration} @@ -98,8 +98,8 @@ Starting from the top: - The cluster `cluster_1S_2R` has one shard, and two replicas. Take a look at the architecture diagram toward the beginning of this document, and compare it with the `shard` definition in the XML below. The shard definition contains two replicas. The host and port for each replica is specified. One replica is stored on `clickhouse-01`, and the other replica is stored on `clickhouse-02`. - Internal replication for the shard is set to true. Each shard can have the internal_replication parameter defined in the config file. If this parameter is set to true, the write operation selects the first healthy replica and writes data to it. 
-```xml title="/etc/clickhouse-server/config.d/remote-servers.xml on clickhouse-01" - + ```xml title="/etc/clickhouse-server/config.d/remote-servers.xml on clickhouse-01" + mysecretphrase @@ -116,8 +116,8 @@ Starting from the top: - -``` + + ``` ### Configuring the use of Keeper {#configuring-the-use-of-keeper} @@ -456,105 +456,105 @@ Connect to node `clickhouse-01` with `clickhouse client` in one shell, and conne 1. Create a database on the cluster configured above -```sql title="run on either node clickhouse-01 or clickhouse-02" -CREATE DATABASE db1 ON CLUSTER cluster_1S_2R -``` -```response -┌─host──────────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ -│ clickhouse-02 │ 9000 │ 0 │ │ 1 │ 0 │ -│ clickhouse-01 │ 9000 │ 0 │ │ 0 │ 0 │ -└───────────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ -``` + ```sql title="run on either node clickhouse-01 or clickhouse-02" + CREATE DATABASE db1 ON CLUSTER cluster_1S_2R + ``` + ```response + ┌─host──────────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ + │ clickhouse-02 │ 9000 │ 0 │ │ 1 │ 0 │ + │ clickhouse-01 │ 9000 │ 0 │ │ 0 │ 0 │ + └───────────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ + ``` 2. Create a table on the database using the ReplicatedMergeTree table engine -```sql title="run on either node clickhouse-01 or clickhouse-02" -CREATE TABLE db1.table1 ON CLUSTER cluster_1S_2R -( + ```sql title="run on either node clickhouse-01 or clickhouse-02" + CREATE TABLE db1.table1 ON CLUSTER cluster_1S_2R + ( `id` UInt64, `column1` String -) -ENGINE = ReplicatedMergeTree -ORDER BY id -``` -```response -┌─host──────────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ -│ clickhouse-02 │ 9000 │ 0 │ │ 1 │ 0 │ -│ clickhouse-01 │ 9000 │ 0 │ │ 0 │ 0 │ -└───────────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ -``` + ) + ENGINE = ReplicatedMergeTree + ORDER BY id + ``` + ```response + ┌─host──────────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ + │ clickhouse-02 │ 9000 │ 0 │ │ 1 │ 0 │ + │ clickhouse-01 │ 9000 │ 0 │ │ 0 │ 0 │ + └───────────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ + ``` 3. Insert data on one node and query it on another node -```sql title="run on node clickhouse-01" -INSERT INTO db1.table1 (id, column1) VALUES (1, 'abc'); -``` + ```sql title="run on node clickhouse-01" + INSERT INTO db1.table1 (id, column1) VALUES (1, 'abc'); + ``` 4. Query the table on the node `clickhouse-02` -```sql title="run on node clickhouse-02" -SELECT * -FROM db1.table1 -``` -```response -┌─id─┬─column1─┐ -│ 1 │ abc │ -└────┴─────────┘ -``` + ```sql title="run on node clickhouse-02" + SELECT * + FROM db1.table1 + ``` + ```response + ┌─id─┬─column1─┐ + │ 1 │ abc │ + └────┴─────────┘ + ``` 5. Insert data on the other node and query it on the node `clickhouse-01` -```sql title="run on node clickhouse-02" -INSERT INTO db1.table1 (id, column1) VALUES (2, 'def'); -``` - -```sql title="run on node clickhouse-01" -SELECT * -FROM db1.table1 -``` -```response -┌─id─┬─column1─┐ -│ 1 │ abc │ -└────┴─────────┘ -┌─id─┬─column1─┐ -│ 2 │ def │ -└────┴─────────┘ -``` + ```sql title="run on node clickhouse-02" + INSERT INTO db1.table1 (id, column1) VALUES (2, 'def'); + ``` + + ```sql title="run on node clickhouse-01" + SELECT * + FROM db1.table1 + ``` + ```response + ┌─id─┬─column1─┐ + │ 1 │ abc │ + └────┴─────────┘ + ┌─id─┬─column1─┐ + │ 2 │ def │ + └────┴─────────┘ + ``` 6. 
Stop one ClickHouse server node -Stop one of the ClickHouse server nodes by running an operating system command similar to the command used to start the node. If you used `systemctl start` to start the node, then use `systemctl stop` to stop it. + Stop one of the ClickHouse server nodes by running an operating system command similar to the command used to start the node. If you used `systemctl start` to start the node, then use `systemctl stop` to stop it. 7. Insert more data on the running node -```sql title="run on the running node" -INSERT INTO db1.table1 (id, column1) VALUES (3, 'ghi'); -``` - -Select the data: -```sql title="run on the running node" -SELECT * -FROM db1.table1 -``` -```response -┌─id─┬─column1─┐ -│ 1 │ abc │ -└────┴─────────┘ -┌─id─┬─column1─┐ -│ 2 │ def │ -└────┴─────────┘ -┌─id─┬─column1─┐ -│ 3 │ ghi │ -└────┴─────────┘ -``` + ```sql title="run on the running node" + INSERT INTO db1.table1 (id, column1) VALUES (3, 'ghi'); + ``` + + Select the data: + ```sql title="run on the running node" + SELECT * + FROM db1.table1 + ``` + ```response + ┌─id─┬─column1─┐ + │ 1 │ abc │ + └────┴─────────┘ + ┌─id─┬─column1─┐ + │ 2 │ def │ + └────┴─────────┘ + ┌─id─┬─column1─┐ + │ 3 │ ghi │ + └────┴─────────┘ + ``` 8. Restart the stopped node and select from there also -```sql title="run on the restarted node" -SELECT * -FROM db1.table1 -``` -```response -┌─id─┬─column1─┐ -│ 1 │ abc │ -└────┴─────────┘ -┌─id─┬─column1─┐ -│ 2 │ def │ -└────┴─────────┘ -┌─id─┬─column1─┐ -│ 3 │ ghi │ -└────┴─────────┘ -``` + ```sql title="run on the restarted node" + SELECT * + FROM db1.table1 + ``` + ```response + ┌─id─┬─column1─┐ + │ 1 │ abc │ + └────┴─────────┘ + ┌─id─┬─column1─┐ + │ 2 │ def │ + └────┴─────────┘ + ┌─id─┬─column1─┐ + │ 3 │ ghi │ + └────┴─────────┘ + ``` diff --git a/docs/deployment-guides/terminology.md b/docs/deployment-guides/terminology.md index de347e53a26..4c9bf827c14 100644 --- a/docs/deployment-guides/terminology.md +++ b/docs/deployment-guides/terminology.md @@ -20,8 +20,8 @@ These deployment examples are based on the advice provided to ClickHouse users b - The [**Replication for fault tolerance**](/deployment-guides/replicated.md) example shows how to replicate your data across two nodes, and use a ReplicatedMergeTree table. This results in having data on two ClickHouse nodes. In addition to the two ClickHouse server nodes there are three ClickHouse Keeper standalone nodes to manage replication. -
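Once the stopped node is back, replication health for the walkthrough above can be verified from either node. A sketch, assuming the `db1` database from that example and the standard `system.replicas` columns:

```sql
-- Sketch: confirm the replicated table is healthy and has no replication backlog
SELECT database, table, is_readonly, absolute_delay, queue_size
FROM system.replicas
WHERE database = 'db1'
```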
+ + ### Intermediate {#intermediate} diff --git a/docs/deployment-modes.md b/docs/deployment-modes.md index ca452cbfb2b..92fd087f917 100644 --- a/docs/deployment-modes.md +++ b/docs/deployment-modes.md @@ -23,9 +23,9 @@ This guide explores the four main ways to deploy and use ClickHouse: * clickhouse-local for command-line data processing * chDB for embedding ClickHouse directly in applications -Each deployment mode has its own strengths and ideal use cases, which we'll explore in detail below. + Each deployment mode has its own strengths and ideal use cases, which we'll explore in detail below. - + ## ClickHouse Server {#clickhouse-server} diff --git a/docs/dictionary/index.md b/docs/dictionary/index.md index 5bf2bb9bb5b..8384b2eaa4b 100644 --- a/docs/dictionary/index.md +++ b/docs/dictionary/index.md @@ -17,7 +17,7 @@ Dictionaries are useful for: - Improving the performance of queries, especially when used with `JOIN`s - Enriching ingested data on the fly without slowing down the ingestion process -Use cases for Dictionary in ClickHouse + Use cases for Dictionary in ClickHouse ## Speeding up joins using a Dictionary {#speeding-up-joins-using-a-dictionary} @@ -155,7 +155,6 @@ SELECT dictGet('votes_dict', ('UpVotes', 'DownVotes'), '11227902') AS votes │ (34999,32) │ └────────────┘ - Exploiting this in our earlier query, we can remove the JOIN: WITH PostIds AS @@ -207,8 +206,6 @@ WHERE Title ILIKE '%clickhouse%' LIMIT 5 FORMAT PrettyCompactMonoBlock - - ┌───────Id─┬─Title─────────────────────────────────────────────────────────┬─Location──────────────┐ │ 52296928 │ Comparision between two Strings in ClickHouse │ Spain │ │ 52345137 │ How to use a file to migrate data from mysql to a clickhouse? │ 中国江苏省Nanjing Shi │ diff --git a/docs/faq/general/columnar-database.md b/docs/faq/general/columnar-database.md index 41f8c69497f..b1d196bcf49 100644 --- a/docs/faq/general/columnar-database.md +++ b/docs/faq/general/columnar-database.md @@ -20,14 +20,14 @@ Key columnar database advantages are: - Aggregating queries against large volumes of data. - Column-wise data compression. -Here is the illustration of the difference between traditional row-oriented systems and columnar databases when building reports: + Here is the illustration of the difference between traditional row-oriented systems and columnar databases when building reports: -**Traditional row-oriented** -Traditional row-oriented database + **Traditional row-oriented** + Traditional row-oriented database -**Columnar** -Columnar database + **Columnar** + Columnar database -A columnar database is the preferred choice for analytical applications because it allows having many columns in a table just in case, but to not pay the cost for unused columns on read query execution time (a traditional OLTP database reads all of the data during queries as the data is stored in rows and not columns). Column-oriented databases are designed for big data processing and data warehousing, they often natively scale using distributed clusters of low-cost hardware to increase throughput. ClickHouse does it with combination of [distributed](../../engines/table-engines/special/distributed.md) and [replicated](../../engines/table-engines/mergetree-family/replication.md) tables. 
+ A columnar database is the preferred choice for analytical applications because it allows having many columns in a table just in case, but to not pay the cost for unused columns on read query execution time (a traditional OLTP database reads all of the data during queries as the data is stored in rows and not columns). Column-oriented databases are designed for big data processing and data warehousing, they often natively scale using distributed clusters of low-cost hardware to increase throughput. ClickHouse does it with combination of [distributed](../../engines/table-engines/special/distributed.md) and [replicated](../../engines/table-engines/mergetree-family/replication.md) tables. -If you'd like a deep dive into the history of column databases, how they differ from row-oriented databases, and the use cases for a column database, see [the column databases guide](https://clickhouse.com/engineering-resources/what-is-columnar-database). + If you'd like a deep dive into the history of column databases, how they differ from row-oriented databases, and the use cases for a column database, see [the column databases guide](https://clickhouse.com/engineering-resources/what-is-columnar-database). diff --git a/docs/faq/general/dbms-naming.md b/docs/faq/general/dbms-naming.md index 5c54e43fe07..03eae41eb62 100644 --- a/docs/faq/general/dbms-naming.md +++ b/docs/faq/general/dbms-naming.md @@ -15,6 +15,6 @@ This two-part meaning has two consequences: - The only correct way to write Click**H**ouse is with capital H. - If you need to abbreviate it, use **CH**. For some historical reasons, abbreviating as CK is also popular in China, mostly because one of the first talks about ClickHouse in Chinese used this form. -:::info -Many years after ClickHouse got its name, this approach of combining two words that are meaningful on their own has been highlighted as the best way to name a database in a [research by Andy Pavlo](https://www.cs.cmu.edu/~pavlo/blog/2020/03/on-naming-a-database-management-system.html), an Associate Professor of Databases at Carnegie Mellon University. ClickHouse shared his "best database name of all time" award with Postgres. -::: + :::info + Many years after ClickHouse got its name, this approach of combining two words that are meaningful on their own has been highlighted as the best way to name a database in a [research by Andy Pavlo](https://www.cs.cmu.edu/~pavlo/blog/2020/03/on-naming-a-database-management-system.html), an Associate Professor of Databases at Carnegie Mellon University. ClickHouse shared his "best database name of all time" award with Postgres. + ::: diff --git a/docs/faq/general/index.md b/docs/faq/general/index.md index abe0a8decd2..ec91ee5a0ae 100644 --- a/docs/faq/general/index.md +++ b/docs/faq/general/index.md @@ -20,7 +20,6 @@ description: 'Index page listing general questions about ClickHouse' - [Why not use something like MapReduce?](../../faq/general/mapreduce.md) - [How do I contribute code to ClickHouse?](/knowledgebase/how-do-i-contribute-code-to-clickhouse) -:::info Don't see what you're looking for? -Check out our [Knowledge Base](/knowledgebase/) and also browse the many helpful articles found here in the documentation. -::: - + :::info Don't see what you're looking for? + Check out our [Knowledge Base](/knowledgebase/) and also browse the many helpful articles found here in the documentation. 
+ ::: diff --git a/docs/faq/general/mapreduce.md b/docs/faq/general/mapreduce.md index b056ea32858..91009e30fc9 100644 --- a/docs/faq/general/mapreduce.md +++ b/docs/faq/general/mapreduce.md @@ -9,7 +9,7 @@ keywords: ['MapReduce'] # Why not use something like MapReduce? {#why-not-use-something-like-mapreduce} -We can refer to systems like MapReduce as distributed computing systems in which the reduce operation is based on distributed sorting. The most common open-source solution in this class is [Apache Hadoop](http://hadoop.apache.org). +We can refer to systems like MapReduce as distributed computing systems in which the reduce operation is based on distributed sorting. The most common open-source solution in this class is [Apache Hadoop](http://hadoop.apache.org). These systems aren't appropriate for online queries due to their high latency. In other words, they can't be used as the back-end for a web interface. These types of systems aren't useful for real-time data updates. Distributed sorting isn't the best way to perform reduce operations if the result of the operation and all the intermediate results (if there are any) are located in the RAM of a single server, which is usually the case for online queries. In such a case, a hash table is an optimal way to perform reduce operations. A common approach to optimizing map-reduce tasks is pre-aggregation (partial reduce) using a hash table in RAM. The user performs this optimization manually. Distributed sorting is one of the main causes of reduced performance when running simple map-reduce tasks. diff --git a/docs/faq/general/ne-tormozit.md b/docs/faq/general/ne-tormozit.md index ec09494e9a8..79f1a444fe1 100644 --- a/docs/faq/general/ne-tormozit.md +++ b/docs/faq/general/ne-tormozit.md @@ -13,8 +13,8 @@ We often get this question when people see vintage (limited production) ClickHou Before ClickHouse became open-source, it was developed as an in-house storage system by a large European IT company, [Yandex](https://yandex.com/company/). That's why it initially got its slogan in Cyrillic, which is "не тормозит" (pronounced as "ne tormozit"). After the open-source release, we first produced some of those t-shirts for local events, and it was a no-brainer to use the slogan as-is. -A second batch of these t-shirts was supposed to be given away at international events, and we tried to make an English version of the slogan. -Unfortunately, we just couldn't come up with a punchy equivalent in English. The original phrase is elegant in its expression while being succinct, and restrictions on space on the t-shirt meant that we failed to come up with a good enough translation as most options appeared to be either too long or inaccurate. +A second batch of these t-shirts was supposed to be given away at international events, and we tried to make an English version of the slogan. +Unfortunately, we just couldn't come up with a punchy equivalent in English. The original phrase is elegant in its expression while being succinct, and restrictions on space on the t-shirt meant that we failed to come up with a good enough translation as most options appeared to be either too long or inaccurate. We decided to keep the slogan even on t-shirts produced for international events. It appeared to be a great decision because people all over the world were positively surprised and curious when they saw it. So, what does it mean? Here are some ways to translate *"не тормозит"*: @@ -22,10 +22,10 @@ So, what does it mean? 
Here are some ways to translate *"не тормозит"* - If you translate it literally, it sounds something like *"ClickHouse does not press the brake pedal"*. - Shorter, but less precise translations might be *"ClickHouse is not slow"*, *"ClickHouse does not lag"* or just *"ClickHouse is fast"*. -If you haven't seen one of those t-shirts in person, you can check them out online in many ClickHouse-related videos. For example, this one: + If you haven't seen one of those t-shirts in person, you can check them out online in many ClickHouse-related videos. For example, this one: -
- (video embed)
+ (video embed)
-_P.S. These t-shirts are not for sale_, they were given away for free at some [ClickHouse Meetups](https://www.meetup.com/pro/clickhouse/), usually as a gift for best questions or other forms of active participation. Now, these t-shirts are no longer produced, and they have become highly valued collector's items. + _P.S. These t-shirts are not for sale_, they were given away for free at some [ClickHouse Meetups](https://www.meetup.com/pro/clickhouse/), usually as a gift for best questions or other forms of active participation. Now, these t-shirts are no longer produced, and they have become highly valued collector's items. diff --git a/docs/faq/integration/index.md b/docs/faq/integration/index.md index 76939b7bfb3..fa0683ef67e 100644 --- a/docs/faq/integration/index.md +++ b/docs/faq/integration/index.md @@ -17,6 +17,6 @@ description: 'Landing page listing questions related to integrating ClickHouse w - [Can ClickHouse read tables from PostgreSQL](/integrations/data-ingestion/dbms/postgresql/connecting-to-postgresql.md) - [What if I have a problem with encodings when connecting to Oracle via ODBC?](/faq/integration/oracle-odbc.md) -:::info Don't see what you're looking for? -Check out our [Knowledge Base](/knowledgebase/) and also browse the many helpful articles found here in the documentation. -::: + :::info Don't see what you're looking for? + Check out our [Knowledge Base](/knowledgebase/) and also browse the many helpful articles found here in the documentation. + ::: diff --git a/docs/faq/integration/json-import.md b/docs/faq/integration/json-import.md index 6363b725a52..4d99b0c08b5 100644 --- a/docs/faq/integration/json-import.md +++ b/docs/faq/integration/json-import.md @@ -31,6 +31,6 @@ Instead of inserting data manually, you might consider to use an [integration to - `input_format_skip_unknown_fields` allows to insert JSON even if there were additional fields not present in table schema (by discarding them). - `input_format_import_nested_json` allows to insert nested JSON objects into columns of [Nested](../../sql-reference/data-types/nested-data-structures/index.md) type. -:::note -Settings are specified as `GET` parameters for the HTTP interface or as additional command-line arguments prefixed with `--` for the `CLI` interface. -::: + :::note + Settings are specified as `GET` parameters for the HTTP interface or as additional command-line arguments prefixed with `--` for the `CLI` interface. + ::: diff --git a/docs/faq/operations/index.md b/docs/faq/operations/index.md index 2253a55fc7a..4bf59cf1371 100644 --- a/docs/faq/operations/index.md +++ b/docs/faq/operations/index.md @@ -17,7 +17,6 @@ description: 'Landing page for questions about operating ClickHouse servers and - [Can you update or delete rows in ClickHouse?](/guides/developer/mutations.md) - [Does ClickHouse support multi-region replication?](/faq/operations/multi-region-replication.md) -:::info Don't see what you're looking for? -Check out our [Knowledge Base](/knowledgebase/) and also browse the many helpful articles found here in the documentation. -::: - + :::info Don't see what you're looking for? + Check out our [Knowledge Base](/knowledgebase/) and also browse the many helpful articles found here in the documentation. 
+ ::: diff --git a/docs/faq/operations/production.md b/docs/faq/operations/production.md index 193e01358cc..73567c6fb94 100644 --- a/docs/faq/operations/production.md +++ b/docs/faq/operations/production.md @@ -10,10 +10,10 @@ description: 'This page provides guidance on which ClickHouse version to use in First of all, let's discuss why people ask this question in the first place. There are two key reasons: -1. ClickHouse is developed with pretty high velocity, and usually there are 10+ stable releases per year. That makes a wide range of releases to choose from, which is not so trivial of a choice. -2. Some users want to avoid spending time figuring out which version works best for their use case and just follow someone else's advice. +1. ClickHouse is developed with pretty high velocity, and usually there are 10+ stable releases per year. That makes a wide range of releases to choose from, which is not so trivial of a choice. +2. Some users want to avoid spending time figuring out which version works best for their use case and just follow someone else's advice. -The second reason is more fundamental, so we'll start with that one and then get back to navigating through various ClickHouse releases. + The second reason is more fundamental, so we'll start with that one and then get back to navigating through various ClickHouse releases. ## Which ClickHouse version do you recommend? {#which-clickhouse-version-do-you-recommend} @@ -33,35 +33,35 @@ Here are some key points to get reasonable fidelity in a pre-production environm - If your production uses complex features like replication, distributed tables and cascading materialized views, make sure they are configured similarly in pre-production. - There's a trade-off on using the roughly same number of servers or VMs in pre-production as in production but of smaller size, or much less of them but of the same size. The first option might catch extra network-related issues, while the latter is easier to manage. -The second area to invest in is **automated testing infrastructure**. Don't assume that if some kind of query has executed successfully once, it'll continue to do so forever. It's OK to have some unit tests where ClickHouse is mocked, but make sure your product has a reasonable set of automated tests that are run against real ClickHouse and check that all important use cases are still working as expected. + The second area to invest in is **automated testing infrastructure**. Don't assume that if some kind of query has executed successfully once, it'll continue to do so forever. It's OK to have some unit tests where ClickHouse is mocked, but make sure your product has a reasonable set of automated tests that are run against real ClickHouse and check that all important use cases are still working as expected. -An extra step forward could be contributing those automated tests to [ClickHouse's open-source test infrastructure](https://github.com/ClickHouse/ClickHouse/tree/master/tests) that are continuously used in its day-to-day development. It definitely will take some additional time and effort to learn [how to run it](../../development/tests.md) and then how to adapt your tests to this framework, but it'll pay off by ensuring that ClickHouse releases are already tested against them when they are announced stable, instead of repeatedly losing time on reporting the issue after the fact and then waiting for a bugfix to be implemented, backported and released. 
Some companies even have such test contributions to infrastructure by its use as an internal policy, (called [Beyonce's Rule](https://www.oreilly.com/library/view/software-engineering-at/9781492082781/ch01.html#policies_that_scale_well) at Google).
+ An extra step forward could be contributing those automated tests to [ClickHouse's open-source test infrastructure](https://github.com/ClickHouse/ClickHouse/tree/master/tests) that are continuously used in its day-to-day development. It definitely will take some additional time and effort to learn [how to run it](../../development/tests.md) and then how to adapt your tests to this framework, but it'll pay off by ensuring that ClickHouse releases are already tested against them when they are announced stable, instead of repeatedly losing time on reporting the issue after the fact and then waiting for a bugfix to be implemented, backported and released. Some companies even make contributing such tests to the infrastructure they rely on an internal policy (called [Beyonce's Rule](https://www.oreilly.com/library/view/software-engineering-at/9781492082781/ch01.html#policies_that_scale_well) at Google).
+ When you have your pre-production environment and testing infrastructure in place, choosing the best version is straightforward:
-1. Routinely run your automated tests against new ClickHouse releases. You can do it even for ClickHouse releases that are marked as `testing`, but going forward to the next steps with them is not recommended.
-2. Deploy the ClickHouse release that passed the tests to pre-production and check that all processes are running as expected.
-3. Report any issues you discovered to [ClickHouse GitHub Issues](https://github.com/ClickHouse/ClickHouse/issues).
-4. If there were no major issues, it should be safe to start deploying ClickHouse release to your production environment. Investing in gradual release automation that implements an approach similar to [canary releases](https://martinfowler.com/bliki/CanaryRelease.html) or [green-blue deployments](https://martinfowler.com/bliki/BlueGreenDeployment.html) might further reduce the risk of issues in production.
+1. Routinely run your automated tests against new ClickHouse releases. You can do it even for ClickHouse releases that are marked as `testing`, but going forward to the next steps with them is not recommended.
+2. Deploy the ClickHouse release that passed the tests to pre-production and check that all processes are running as expected.
+3. Report any issues you discovered to [ClickHouse GitHub Issues](https://github.com/ClickHouse/ClickHouse/issues).
+4. If there were no major issues, it should be safe to start deploying the ClickHouse release to your production environment. Investing in gradual release automation that implements an approach similar to [canary releases](https://martinfowler.com/bliki/CanaryRelease.html) or [blue-green deployments](https://martinfowler.com/bliki/BlueGreenDeployment.html) might further reduce the risk of issues in production.
-As you might have noticed, there's nothing specific to ClickHouse in the approach described above - people do that for any piece of infrastructure they rely on if they take their production environment seriously.
+ As you might have noticed, there's nothing specific to ClickHouse in the approach described above - people do that for any piece of infrastructure they rely on if they take their production environment seriously.
## How to choose between ClickHouse releases? {#how-to-choose-between-clickhouse-releases}
If you look into the contents of the ClickHouse package repository, you'll see two kinds of packages:
-1. `stable`
-2. `lts` (long-term support)
+1. `stable`
+2. `lts` (long-term support)
-Here is some guidance on how to choose between them:
+ Here is some guidance on how to choose between them:
- `stable` is the kind of package we recommend by default. They are released roughly monthly (and thus provide new features with reasonable delay) and three latest stable releases are supported in terms of diagnostics and backporting of bug fixes.
- `lts` are released twice a year and are supported for a year after their initial release. You might prefer them over `stable` in the following cases:
- Your company has some internal policies that do not allow for frequent upgrades or using non-LTS software.
- You are using ClickHouse in some secondary products that either do not require any complex ClickHouse features or do not have enough resources to keep it updated.
-Many teams who initially think that `lts` is the way to go often switch to `stable` anyway because of some recent feature that's important for their product.
+ Many teams who initially think that `lts` is the way to go often switch to `stable` anyway because of some recent feature that's important for their product.
-:::tip
-One more thing to keep in mind when upgrading ClickHouse: we're always keeping an eye on compatibility across releases, but sometimes it's not reasonable to keep and some minor details might change. So make sure you check the [changelog](/whats-new/changelog/index.md) before upgrading to see if there are any notes about backward-incompatible changes.
-:::
+ :::tip
+ One more thing to keep in mind when upgrading ClickHouse: we're always keeping an eye on compatibility across releases, but sometimes it's not reasonable to keep it, and some minor details might change. So make sure you check the [changelog](/whats-new/changelog/index.md) before upgrading to see if there are any notes about backward-incompatible changes.
+ :::
diff --git a/docs/faq/operations/separate_storage.md b/docs/faq/operations/separate_storage.md
index bc9245a9353..be3eae0247e 100644
--- a/docs/faq/operations/separate_storage.md
+++ b/docs/faq/operations/separate_storage.md
@@ -10,4 +10,3 @@ description: 'This page provides an answer as to whether it is possible to deplo
The short answer is "yes". Object storage (S3, GCS) can be used as the elastic primary storage backend for data in ClickHouse tables. [S3-backed MergeTree](/integrations/data-ingestion/s3/index.md) and [GCS-backed MergeTree](/integrations/data-ingestion/gcs/index.md) guides are published. Only metadata is stored locally on compute nodes in this configuration. You can easily upscale and downscale compute resources in this setup as additional nodes only need to replicate metadata.
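To make the separation concrete, here is a minimal sketch of what the table side of such a setup can look like, assuming an S3-backed disk and a storage policy named `s3_main` have already been declared in the server configuration (the policy name and the columns are illustrative, not taken from the guides linked above):

```sql
-- Sketch only: requires a pre-configured storage policy named 's3_main'
-- whose disk points at an S3 or GCS bucket.
CREATE TABLE events
(
    `timestamp` DateTime,
    `user_id` UInt64,
    `payload` String
)
ENGINE = MergeTree
ORDER BY (user_id, timestamp)
SETTINGS storage_policy = 's3_main';
```

With such a policy, the data parts live in the object storage bucket, while each compute node keeps only the lightweight part metadata on its local disk.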
- diff --git a/docs/faq/use-cases/index.md b/docs/faq/use-cases/index.md index 6331eb4d6f0..f36740e2610 100644 --- a/docs/faq/use-cases/index.md +++ b/docs/faq/use-cases/index.md @@ -11,7 +11,6 @@ description: 'Landing page listing common questions about ClickHouse use cases' - [Can I use ClickHouse as a time-series database?](/knowledgebase/time-series) - [Can I use ClickHouse as a key-value storage?](/knowledgebase/key-value) -:::info Don't see what you're looking for? -Check out our [Knowledge Base](/knowledgebase/) and also browse the many helpful articles found here in the documentation. -::: - + :::info Don't see what you're looking for? + Check out our [Knowledge Base](/knowledgebase/) and also browse the many helpful articles found here in the documentation. + ::: diff --git a/docs/getting-started/example-datasets/amazon-reviews.md b/docs/getting-started/example-datasets/amazon-reviews.md index 7234e594489..fdac4863aac 100644 --- a/docs/getting-started/example-datasets/amazon-reviews.md +++ b/docs/getting-started/example-datasets/amazon-reviews.md @@ -16,77 +16,77 @@ The queries below were executed on a **Production** instance of ClickHouse Cloud 1. Without inserting the data into ClickHouse, we can query it in place. Let's grab some rows, so we can see what they look like: -```sql -SELECT * -FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/amazon_reviews_2015.snappy.parquet') -LIMIT 3 -``` - -The rows look like: - -```response -Row 1: -────── -review_date: 16462 -marketplace: US -customer_id: 25444946 -- 25.44 million -review_id: R146L9MMZYG0WA -product_id: B00NV85102 -product_parent: 908181913 -- 908.18 million -product_title: XIKEZAN iPhone 6 Plus 5.5 inch Waterproof Case, Shockproof Dirtproof Snowproof Full Body Skin Case Protective Cover with Hand Strap & Headphone Adapter & Kickstand -product_category: Wireless -star_rating: 4 -helpful_votes: 0 -total_votes: 0 -vine: false -verified_purchase: true -review_headline: case is sturdy and protects as I want -review_body: I won't count on the waterproof part (I took off the rubber seals at the bottom because the got on my nerves). But the case is sturdy and protects as I want. - -Row 2: -────── -review_date: 16462 -marketplace: US -customer_id: 1974568 -- 1.97 million -review_id: R2LXDXT293LG1T -product_id: B00OTFZ23M -product_parent: 951208259 -- 951.21 million -product_title: Season.C Chicago Bulls Marilyn Monroe No.1 Hard Back Case Cover for Samsung Galaxy S5 i9600 -product_category: Wireless -star_rating: 1 -helpful_votes: 0 -total_votes: 0 -vine: false -verified_purchase: true -review_headline: One Star -review_body: Cant use the case because its big for the phone. Waist of money! - -Row 3: -────── -review_date: 16462 -marketplace: US -customer_id: 24803564 -- 24.80 million -review_id: R7K9U5OEIRJWR -product_id: B00LB8C4U4 -product_parent: 524588109 -- 524.59 million -product_title: iPhone 5s Case, BUDDIBOX [Shield] Slim Dual Layer Protective Case with Kickstand for Apple iPhone 5 and 5s -product_category: Wireless -star_rating: 4 -helpful_votes: 0 -total_votes: 0 -vine: false -verified_purchase: true -review_headline: but overall this case is pretty sturdy and provides good protection for the phone -review_body: The front piece was a little difficult to secure to the phone at first, but overall this case is pretty sturdy and provides good protection for the phone, which is what I need. I would buy this case again. 
-``` + ```sql + SELECT * + FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/amazon_reviews_2015.snappy.parquet') + LIMIT 3 + ``` + + The rows look like: + + ```response + Row 1: + ────── + review_date: 16462 + marketplace: US + customer_id: 25444946 -- 25.44 million + review_id: R146L9MMZYG0WA + product_id: B00NV85102 + product_parent: 908181913 -- 908.18 million + product_title: XIKEZAN iPhone 6 Plus 5.5 inch Waterproof Case, Shockproof Dirtproof Snowproof Full Body Skin Case Protective Cover with Hand Strap & Headphone Adapter & Kickstand + product_category: Wireless + star_rating: 4 + helpful_votes: 0 + total_votes: 0 + vine: false + verified_purchase: true + review_headline: case is sturdy and protects as I want + review_body: I won't count on the waterproof part (I took off the rubber seals at the bottom because the got on my nerves). But the case is sturdy and protects as I want. + + Row 2: + ────── + review_date: 16462 + marketplace: US + customer_id: 1974568 -- 1.97 million + review_id: R2LXDXT293LG1T + product_id: B00OTFZ23M + product_parent: 951208259 -- 951.21 million + product_title: Season.C Chicago Bulls Marilyn Monroe No.1 Hard Back Case Cover for Samsung Galaxy S5 i9600 + product_category: Wireless + star_rating: 1 + helpful_votes: 0 + total_votes: 0 + vine: false + verified_purchase: true + review_headline: One Star + review_body: Cant use the case because its big for the phone. Waist of money! + + Row 3: + ────── + review_date: 16462 + marketplace: US + customer_id: 24803564 -- 24.80 million + review_id: R7K9U5OEIRJWR + product_id: B00LB8C4U4 + product_parent: 524588109 -- 524.59 million + product_title: iPhone 5s Case, BUDDIBOX [Shield] Slim Dual Layer Protective Case with Kickstand for Apple iPhone 5 and 5s + product_category: Wireless + star_rating: 4 + helpful_votes: 0 + total_votes: 0 + vine: false + verified_purchase: true + review_headline: but overall this case is pretty sturdy and provides good protection for the phone + review_body: The front piece was a little difficult to secure to the phone at first, but overall this case is pretty sturdy and provides good protection for the phone, which is what I need. I would buy this case again. + ``` 2. Let's define a new `MergeTree` table named `amazon_reviews` to store this data in ClickHouse: -```sql -CREATE DATABASE amazon + ```sql + CREATE DATABASE amazon -CREATE TABLE amazon.amazon_reviews -( + CREATE TABLE amazon.amazon_reviews + ( `review_date` Date, `marketplace` LowCardinality(String), `customer_id` UInt64, @@ -107,134 +107,133 @@ CREATE TABLE amazon.amazon_reviews SELECT * ORDER BY helpful_votes ) -) -ENGINE = MergeTree -ORDER BY (review_date, product_category) -``` + ) + ENGINE = MergeTree + ORDER BY (review_date, product_category) + ``` 3. The following `INSERT` command uses the `s3Cluster` table function, which allows the processing of multiple S3 files in parallel using all the nodes of your cluster. 
We also use a wildcard to insert any file that starts with the name `https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/amazon_reviews_*.snappy.parquet`: -```sql -INSERT INTO amazon.amazon_reviews SELECT * -FROM s3Cluster('default', -'https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/amazon_reviews_*.snappy.parquet') -``` + ```sql + INSERT INTO amazon.amazon_reviews SELECT * + FROM s3Cluster('default', + 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/amazon_reviews/amazon_reviews_*.snappy.parquet') + ``` -:::tip -In ClickHouse Cloud, the name of the cluster is `default`. Change `default` to the name of your cluster...or use the `s3` table function (instead of `s3Cluster`) if you do not have a cluster. -::: + :::tip + In ClickHouse Cloud, the name of the cluster is `default`. Change `default` to the name of your cluster...or use the `s3` table function (instead of `s3Cluster`) if you do not have a cluster. + ::: 5. That query doesn't take long - averaging about 300,000 rows per second. Within 5 minutes or so you should see all the rows inserted: -```sql runnable -SELECT formatReadableQuantity(count()) -FROM amazon.amazon_reviews -``` + ```sql runnable + SELECT formatReadableQuantity(count()) + FROM amazon.amazon_reviews + ``` 6. Let's see how much space our data is using: -```sql runnable -SELECT + ```sql runnable + SELECT disk_name, formatReadableSize(sum(data_compressed_bytes) AS size) AS compressed, formatReadableSize(sum(data_uncompressed_bytes) AS usize) AS uncompressed, round(usize / size, 2) AS compr_rate, sum(rows) AS rows, count() AS part_count -FROM system.parts -WHERE (active = 1) AND (table = 'amazon_reviews') -GROUP BY disk_name -ORDER BY size DESC -``` - -The original data was about 70G, but compressed in ClickHouse it takes up about 30G. + FROM system.parts + WHERE (active = 1) AND (table = 'amazon_reviews') + GROUP BY disk_name + ORDER BY size DESC + ``` + The original data was about 70G, but compressed in ClickHouse it takes up about 30G. ## Example queries {#example-queries} 7. Let's run some queries. Here are the top 10 most-helpful reviews in the dataset: -```sql runnable -SELECT + ```sql runnable + SELECT product_title, review_headline -FROM amazon.amazon_reviews -ORDER BY helpful_votes DESC -LIMIT 10 -``` + FROM amazon.amazon_reviews + ORDER BY helpful_votes DESC + LIMIT 10 + ``` -:::note -This query is using a [projection](/data-modeling/projections) to speed up performance. -::: + :::note + This query is using a [projection](/data-modeling/projections) to speed up performance. + ::: 8. Here are the top 10 products in Amazon with the most reviews: -```sql runnable -SELECT + ```sql runnable + SELECT any(product_title), count() -FROM amazon.amazon_reviews -GROUP BY product_id -ORDER BY 2 DESC -LIMIT 10; -``` + FROM amazon.amazon_reviews + GROUP BY product_id + ORDER BY 2 DESC + LIMIT 10; + ``` 9. Here are the average review ratings per month for each product (an actual [Amazon job interview question](https://datalemur.com/questions/sql-avg-review-ratings)!): -```sql runnable -SELECT + ```sql runnable + SELECT toStartOfMonth(review_date) AS month, any(product_title), avg(star_rating) AS avg_stars -FROM amazon.amazon_reviews -GROUP BY + FROM amazon.amazon_reviews + GROUP BY month, product_id -ORDER BY + ORDER BY month DESC, product_id ASC -LIMIT 20; -``` + LIMIT 20; + ``` 10. Here are the total number of votes per product category. 
This query is fast because `product_category` is in the primary key: -```sql runnable -SELECT + ```sql runnable + SELECT sum(total_votes), product_category -FROM amazon.amazon_reviews -GROUP BY product_category -ORDER BY 1 DESC -``` + FROM amazon.amazon_reviews + GROUP BY product_category + ORDER BY 1 DESC + ``` 11. Let's find the products with the word **"awful"** occurring most frequently in the review. This is a big task - over 151M strings have to be parsed looking for a single word: -```sql runnable settings={'enable_parallel_replicas':1} -SELECT + ```sql runnable settings={'enable_parallel_replicas':1} + SELECT product_id, any(product_title), avg(star_rating), count() AS count -FROM amazon.amazon_reviews -WHERE position(review_body, 'awful') > 0 -GROUP BY product_id -ORDER BY count DESC -LIMIT 50; -``` + FROM amazon.amazon_reviews + WHERE position(review_body, 'awful') > 0 + GROUP BY product_id + ORDER BY count DESC + LIMIT 50; + ``` -Notice the query time for such a large amount of data. The results are also a fun read! + Notice the query time for such a large amount of data. The results are also a fun read! 12. We can run the same query again, except this time we search for **awesome** in the reviews: -```sql runnable settings={'enable_parallel_replicas':1} -SELECT + ```sql runnable settings={'enable_parallel_replicas':1} + SELECT product_id, any(product_title), avg(star_rating), count() AS count -FROM amazon.amazon_reviews -WHERE position(review_body, 'awesome') > 0 -GROUP BY product_id -ORDER BY count DESC -LIMIT 50; -``` + FROM amazon.amazon_reviews + WHERE position(review_body, 'awesome') > 0 + GROUP BY product_id + ORDER BY count DESC + LIMIT 50; + ``` diff --git a/docs/getting-started/example-datasets/brown-benchmark.md b/docs/getting-started/example-datasets/brown-benchmark.md index 59f07d3197e..8ad45ec5d8a 100644 --- a/docs/getting-started/example-datasets/brown-benchmark.md +++ b/docs/getting-started/example-datasets/brown-benchmark.md @@ -54,7 +54,6 @@ ENGINE = MergeTree() ORDER BY (machine_group, machine_name, log_time); ``` - ```sql CREATE TABLE mgbench.logs2 ( log_time DateTime, @@ -67,7 +66,6 @@ ENGINE = MergeTree() ORDER BY log_time; ``` - ```sql CREATE TABLE mgbench.logs3 ( log_time DateTime64, @@ -122,7 +120,6 @@ FROM ( GROUP BY machine_name; ``` - ```sql -- Q1.2: Which computer lab machines have been offline in the past day? @@ -254,7 +251,6 @@ WHERE status_code >= 200 AND log_time < TIMESTAMP '2012-05-20 00:00:00'; ``` - ```sql -- Q2.3: What was the average path depth for top-level requests in the past month? @@ -280,7 +276,6 @@ GROUP BY top_level ORDER BY top_level; ``` - ```sql -- Q2.4: During the last 3 months, which clients have made an excessive number of requests? @@ -293,7 +288,6 @@ HAVING COUNT(*) >= 100000 ORDER BY num_requests DESC; ``` - ```sql -- Q2.5: What are the daily unique visitors? @@ -308,7 +302,6 @@ GROUP BY dt ORDER BY dt; ``` - ```sql -- Q2.6: What are the average and maximum data transfer rates (Gbps)? @@ -322,7 +315,6 @@ FROM ( ) AS r; ``` - ```sql -- Q3.1: Did the indoor temperature reach freezing over the weekend? @@ -333,7 +325,6 @@ WHERE event_type = 'temperature' AND log_time >= '2019-11-29 17:00:00.000'; ``` - ```sql -- Q3.4: Over the past 6 months, how frequently were each door opened? @@ -407,7 +398,6 @@ WHERE dt >= DATE '2019-06-01' AND dt < DATE '2019-09-01'; ``` - ```sql -- Q3.6: For each device category, what are the monthly power consumption metrics? 
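-- A side sketch, not part of the benchmark queries above: the Amazon reviews
-- word-search queries earlier in this diff use position(), which is case-sensitive,
-- so reviews containing "Awful" or "AWFUL" are not matched. Assuming the same
-- amazon.amazon_reviews table, positionCaseInsensitive() relaxes the match while
-- keeping the rest of the query unchanged (it still scans every review_body value).
SELECT
    product_id,
    any(product_title),
    avg(star_rating),
    count() AS count
FROM amazon.amazon_reviews
WHERE positionCaseInsensitive(review_body, 'awful') > 0
GROUP BY product_id
ORDER BY count DESC
LIMIT 50;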
diff --git a/docs/getting-started/example-datasets/cell-towers.md b/docs/getting-started/example-datasets/cell-towers.md index 5626cb4bf89..92caf13add7 100644 --- a/docs/getting-started/example-datasets/cell-towers.md +++ b/docs/getting-started/example-datasets/cell-towers.md @@ -36,9 +36,9 @@ In this guide you will learn how to: - Connect Apache Superset to ClickHouse - Build a dashboard based on data available in the dataset -Here is a preview of the dashboard created in this guide: + Here is a preview of the dashboard created in this guide: -Dashboard of cell towers by radio type in mcc 204 + Dashboard of cell towers by radio type in mcc 204 ## Get the dataset {#get-the-dataset} @@ -50,23 +50,16 @@ OpenCelliD Project is licensed under a Creative Commons Attribution-ShareAlike 4 - ### Load the sample data {#load-the-sample-data} - ClickHouse Cloud provides an easy-button for uploading this dataset from S3. Log in to your ClickHouse Cloud organization, or create a free trial at [ClickHouse.cloud](https://clickhouse.cloud). - Choose the **Cell Towers** dataset from the **Sample data** tab, and **Load data**: - Load cell towers dataset - ### Examine the schema of the cell_towers table {#examine-the-schema-of-the-cell_towers-table} ```sql DESCRIBE TABLE cell_towers ``` - - This is the output of `DESCRIBE`. Down further in this guide the field type choices will be described. ```response ┌─name──────────┬─type──────────────────────────────────────────────────────────────────┬ @@ -86,39 +79,33 @@ This is the output of `DESCRIBE`. Down further in this guide the field type cho │ averageSignal │ UInt8 │ └───────────────┴───────────────────────────────────────────────────────────────────────┴ ``` - - 1. Create a table: - ```sql CREATE TABLE cell_towers ( - radio Enum8('' = 0, 'CDMA' = 1, 'GSM' = 2, 'LTE' = 3, 'NR' = 4, 'UMTS' = 5), - mcc UInt16, - net UInt16, - area UInt16, - cell UInt64, - unit Int16, - lon Float64, - lat Float64, - range UInt32, - samples UInt32, - changeable UInt8, - created DateTime, - updated DateTime, - averageSignal UInt8 +radio Enum8('' = 0, 'CDMA' = 1, 'GSM' = 2, 'LTE' = 3, 'NR' = 4, 'UMTS' = 5), +mcc UInt16, +net UInt16, +area UInt16, +cell UInt64, +unit Int16, +lon Float64, +lat Float64, +range UInt32, +samples UInt32, +changeable UInt8, +created DateTime, +updated DateTime, +averageSignal UInt8 ) ENGINE = MergeTree ORDER BY (radio, mcc, net, created); ``` - 2. Import the dataset from a public S3 bucket (686 MB): - ```sql INSERT INTO cell_towers SELECT * FROM s3('https://datasets-documentation.s3.amazonaws.com/cell_towers/cell_towers.csv.xz', 'CSVWithNames') ``` - @@ -126,46 +113,46 @@ INSERT INTO cell_towers SELECT * FROM s3('https://datasets-documentation.s3.amaz 1. A number of cell towers by type: -```sql -SELECT radio, count() AS c FROM cell_towers GROUP BY radio ORDER BY c DESC -``` -```response -┌─radio─┬────────c─┐ -│ UMTS │ 20686487 │ -│ LTE │ 12101148 │ -│ GSM │ 9931304 │ -│ CDMA │ 556344 │ -│ NR │ 867 │ -└───────┴──────────┘ - -5 rows in set. Elapsed: 0.011 sec. Processed 43.28 million rows, 43.28 MB (3.83 billion rows/s., 3.83 GB/s.) -``` + ```sql + SELECT radio, count() AS c FROM cell_towers GROUP BY radio ORDER BY c DESC + ``` + ```response + ┌─radio─┬────────c─┐ + │ UMTS │ 20686487 │ + │ LTE │ 12101148 │ + │ GSM │ 9931304 │ + │ CDMA │ 556344 │ + │ NR │ 867 │ + └───────┴──────────┘ + + 5 rows in set. Elapsed: 0.011 sec. Processed 43.28 million rows, 43.28 MB (3.83 billion rows/s., 3.83 GB/s.) + ``` 2. 
Cell towers by [mobile country code (MCC)](https://en.wikipedia.org/wiki/Mobile_country_code): -```sql -SELECT mcc, count() FROM cell_towers GROUP BY mcc ORDER BY count() DESC LIMIT 10 -``` -```response -┌─mcc─┬─count()─┐ -│ 310 │ 5024650 │ -│ 262 │ 2622423 │ -│ 250 │ 1953176 │ -│ 208 │ 1891187 │ -│ 724 │ 1836150 │ -│ 404 │ 1729151 │ -│ 234 │ 1618924 │ -│ 510 │ 1353998 │ -│ 440 │ 1343355 │ -│ 311 │ 1332798 │ -└─────┴─────────┘ - -10 rows in set. Elapsed: 0.019 sec. Processed 43.28 million rows, 86.55 MB (2.33 billion rows/s., 4.65 GB/s.) -``` - -Based on the above query and the [MCC list](https://en.wikipedia.org/wiki/Mobile_country_code), the countries with the most cell towers are: the USA, Germany, and Russia. - -You may want to create a [Dictionary](../../sql-reference/dictionaries/index.md) in ClickHouse to decode these values. + ```sql + SELECT mcc, count() FROM cell_towers GROUP BY mcc ORDER BY count() DESC LIMIT 10 + ``` + ```response + ┌─mcc─┬─count()─┐ + │ 310 │ 5024650 │ + │ 262 │ 2622423 │ + │ 250 │ 1953176 │ + │ 208 │ 1891187 │ + │ 724 │ 1836150 │ + │ 404 │ 1729151 │ + │ 234 │ 1618924 │ + │ 510 │ 1353998 │ + │ 440 │ 1343355 │ + │ 311 │ 1332798 │ + └─────┴─────────┘ + + 10 rows in set. Elapsed: 0.019 sec. Processed 43.28 million rows, 86.55 MB (2.33 billion rows/s., 4.65 GB/s.) + ``` + + Based on the above query and the [MCC list](https://en.wikipedia.org/wiki/Mobile_country_code), the countries with the most cell towers are: the USA, Germany, and Russia. + + You may want to create a [Dictionary](../../sql-reference/dictionaries/index.md) in ClickHouse to decode these values. ## Use case: incorporate geo data {#use-case} @@ -175,90 +162,86 @@ Using the [`pointInPolygon`](/sql-reference/functions/geo/coordinates.md/#pointi - ```sql CREATE TABLE moscow (polygon Array(Tuple(Float64, Float64))) ORDER BY polygon; ``` - - ```sql CREATE TEMPORARY TABLE moscow (polygon Array(Tuple(Float64, Float64))); ``` - 2. 
This is a rough shape of Moscow (without "new Moscow"): -```sql -INSERT INTO moscow VALUES ([(37.84172564285271, 55.78000432402266), -(37.8381207618713, 55.775874525970494), (37.83979446823122, 55.775626746008065), (37.84243326983639, 55.77446586811748), (37.84262672750849, 55.771974101091104), (37.84153238623039, 55.77114545193181), (37.841124690460184, 55.76722010265554), -(37.84239076983644, 55.76654891107098), (37.842283558197025, 55.76258709833121), (37.8421759312134, 55.758073999993734), (37.84198330422974, 55.75381499999371), (37.8416827275085, 55.749277102484484), (37.84157576190186, 55.74794544108413), -(37.83897929098507, 55.74525257875241), (37.83739676451868, 55.74404373042019), (37.838732481460525, 55.74298009816793), (37.841183997352545, 55.743060321833575), (37.84097476190185, 55.73938799999373), (37.84048155819702, 55.73570799999372), -(37.840095812164286, 55.73228210777237), (37.83983814285274, 55.73080491981639), (37.83846476321406, 55.729799917464675), (37.83835745269769, 55.72919751082619), (37.838636380279524, 55.72859509486539), (37.8395161005249, 55.727705075632784), -(37.83897964285276, 55.722727886185154), (37.83862557539366, 55.72034817326636), (37.83559735744853, 55.71944437307499), (37.835370708803126, 55.71831419154461), (37.83738169402022, 55.71765218986692), (37.83823396494291, 55.71691750159089), -(37.838056931213345, 55.71547311301385), (37.836812846557606, 55.71221445615604), (37.83522525396725, 55.709331054395555), (37.83269301586908, 55.70953687463627), (37.829667367706236, 55.70903403789297), (37.83311126588435, 55.70552351822608), -(37.83058993121339, 55.70041317726053), (37.82983872750851, 55.69883771404813), (37.82934501586913, 55.69718947487017), (37.828926414016685, 55.69504441658371), (37.82876530422971, 55.69287499999378), (37.82894754100031, 55.690759754047335), -(37.827697554878185, 55.68951421135665), (37.82447346292115, 55.68965045405069), (37.83136543914793, 55.68322046195302), (37.833554015869154, 55.67814012759211), (37.83544184655761, 55.67295011628339), (37.837480388885474, 55.6672498719639), -(37.838960677246064, 55.66316274139358), (37.83926093121332, 55.66046999999383), (37.839025050262435, 55.65869897264431), (37.83670784390257, 55.65794084879904), (37.835656529083245, 55.65694309303843), (37.83704060449217, 55.65689306460552), -(37.83696819873806, 55.65550363526252), (37.83760389616388, 55.65487847246661), (37.83687972750851, 55.65356745541324), (37.83515216004943, 55.65155951234079), (37.83312418518067, 55.64979413590619), (37.82801726983639, 55.64640836412121), -(37.820614174591, 55.64164525405531), (37.818908190475426, 55.6421883258084), (37.81717543386075, 55.64112490388471), (37.81690987037274, 55.63916106913107), (37.815099354492155, 55.637925371757085), (37.808769150787356, 55.633798276884455), -(37.80100123544311, 55.62873670012244), (37.79598013491824, 55.62554336109055), (37.78634567724606, 55.62033499605651), (37.78334147619623, 55.618768681480326), (37.77746201055901, 55.619855533402706), (37.77527329626457, 55.61909966711279), -(37.77801986242668, 55.618770300976294), (37.778212973541216, 55.617257701952106), (37.77784818518065, 55.61574504433011), (37.77016867724609, 55.61148576294007), (37.760191219573976, 55.60599579539028), (37.75338926983641, 55.60227892751446), -(37.746329965606634, 55.59920577639331), (37.73939925396728, 55.59631430313617), (37.73273665739439, 55.5935318803559), (37.7299954450912, 55.59350760316188), (37.7268679946899, 55.59469840523759), (37.72626726983634, 55.59229549697373), 
-(37.7262673598022, 55.59081598950582), (37.71897193121335, 55.5877595845419), (37.70871550793456, 55.58393177431724), (37.700497489410374, 55.580917323756644), (37.69204305026244, 55.57778089778455), (37.68544477378839, 55.57815154690915), -(37.68391050793454, 55.57472945079756), (37.678803592590306, 55.57328235936491), (37.6743402539673, 55.57255251445782), (37.66813862698363, 55.57216388774464), (37.617927457672096, 55.57505691895805), (37.60443099999999, 55.5757737568051), -(37.599683515869145, 55.57749105910326), (37.59754177842709, 55.57796291823627), (37.59625834786988, 55.57906686095235), (37.59501783265684, 55.57746616444403), (37.593090671936025, 55.57671634534502), (37.587018007904, 55.577944600233785), -(37.578692203704804, 55.57982895000019), (37.57327546607398, 55.58116294118248), (37.57385012109279, 55.581550362779), (37.57399562266922, 55.5820107079112), (37.5735356072979, 55.58226289171689), (37.57290393054962, 55.582393529795155), -(37.57037722355653, 55.581919415056234), (37.5592298306885, 55.584471614867844), (37.54189249206543, 55.58867650795186), (37.5297256269836, 55.59158133551745), (37.517837865081766, 55.59443656218868), (37.51200186508174, 55.59635625174229), -(37.506808949737554, 55.59907823904434), (37.49820432275389, 55.6062944994944), (37.494406071441674, 55.60967103463367), (37.494760001358024, 55.61066689753365), (37.49397137107085, 55.61220931698269), (37.49016528606031, 55.613417718449064), -(37.48773249206542, 55.61530616333343), (37.47921386508177, 55.622640129112334), (37.470652153442394, 55.62993723476164), (37.46273446298218, 55.6368075123157), (37.46350692265317, 55.64068225239439), (37.46050283203121, 55.640794546982576), -(37.457627470916734, 55.64118904154646), (37.450718034393326, 55.64690488145138), (37.44239252645875, 55.65397824729769), (37.434587576721185, 55.66053543155961), (37.43582144975277, 55.661693766520735), (37.43576786245721, 55.662755031737014), -(37.430982915344174, 55.664610641628116), (37.428547447097685, 55.66778515273695), (37.42945134592044, 55.668633314343566), (37.42859571562949, 55.66948145750025), (37.4262836402282, 55.670813882451405), (37.418709037048295, 55.6811141674414), -(37.41922139651101, 55.68235377885389), (37.419218771842885, 55.68359335082235), (37.417196501327446, 55.684375235224735), (37.41607020370478, 55.68540557585352), (37.415640857147146, 55.68686637150793), (37.414632153442334, 55.68903015131686), -(37.413344899475064, 55.690896881757396), (37.41171432275391, 55.69264232162232), (37.40948282275393, 55.69455101638112), (37.40703674603271, 55.69638690385348), (37.39607169577025, 55.70451821283731), (37.38952706878662, 55.70942491932811), -(37.387778313491815, 55.71149057784176), (37.39049275399779, 55.71419814298992), (37.385557272491454, 55.7155489617061), (37.38388335714726, 55.71849856042102), (37.378368238098155, 55.7292763261685), (37.37763597123337, 55.730845879211614), -(37.37890062088197, 55.73167906388319), (37.37750451918789, 55.734703664681774), (37.375610832015965, 55.734851959522246), (37.3723813571472, 55.74105626086403), (37.37014935714723, 55.746115620904355), (37.36944173016362, 55.750883999993725), -(37.36975304365541, 55.76335905525834), (37.37244070571134, 55.76432079697595), (37.3724259757175, 55.76636979670426), (37.369922155757884, 55.76735417953104), (37.369892695770275, 55.76823419316575), (37.370214730163575, 55.782312184391266), -(37.370493611114505, 55.78436801120489), (37.37120164550783, 55.78596427165359), (37.37284851456452, 55.7874378183096), (37.37608325135799, 
55.7886695054807), (37.3764587460632, 55.78947647305964), (37.37530000265506, 55.79146512926804), -(37.38235915344241, 55.79899647809345), (37.384344043655396, 55.80113596939471), (37.38594269577028, 55.80322699999366), (37.38711208598329, 55.804919036911976), (37.3880239841309, 55.806610999993666), (37.38928977249147, 55.81001864976979), -(37.39038389947512, 55.81348641242801), (37.39235781481933, 55.81983538336746), (37.393709457672124, 55.82417822811877), (37.394685720901464, 55.82792275755836), (37.39557615344238, 55.830447148154136), (37.39844478226658, 55.83167107969975), -(37.40019761214057, 55.83151823557964), (37.400398790382326, 55.83264967594742), (37.39659544313046, 55.83322180909622), (37.39667059524539, 55.83402792148566), (37.39682089947515, 55.83638877400216), (37.39643489154053, 55.83861656112751), -(37.3955338994751, 55.84072348043264), (37.392680272491454, 55.84502158126453), (37.39241188227847, 55.84659117913199), (37.392529730163616, 55.84816071336481), (37.39486835714723, 55.85288092980303), (37.39873052645878, 55.859893456073635), -(37.40272161111449, 55.86441833633205), (37.40697072750854, 55.867579567544375), (37.410007082016016, 55.868369880337), (37.4120992989502, 55.86920843741314), (37.412668021163924, 55.87055369615854), (37.41482461111453, 55.87170587948249), -(37.41862266137694, 55.873183961039565), (37.42413732540892, 55.874879126654704), (37.4312182698669, 55.875614937236705), (37.43111093783558, 55.8762723478417), (37.43332105622856, 55.87706546369396), (37.43385747619623, 55.87790681284802), -(37.441303050262405, 55.88027084462084), (37.44747234260555, 55.87942070143253), (37.44716141796871, 55.88072960917233), (37.44769797085568, 55.88121221323979), (37.45204320500181, 55.882080694420715), (37.45673176190186, 55.882346110794586), -(37.463383999999984, 55.88252729504517), (37.46682797486874, 55.88294937719063), (37.470014457672086, 55.88361266759345), (37.47751410450743, 55.88546991372396), (37.47860317658232, 55.88534929207307), (37.48165826025772, 55.882563306475106), -(37.48316434442331, 55.8815803226785), (37.483831555817645, 55.882427612793315), (37.483182967125686, 55.88372791409729), (37.483092277908824, 55.88495581062434), (37.4855716508179, 55.8875561994203), (37.486440636245746, 55.887827444039566), -(37.49014203439328, 55.88897899871799), (37.493210285705544, 55.890208937135604), (37.497512451065035, 55.891342397444696), (37.49780744510645, 55.89174030252967), (37.49940333499519, 55.89239745507079), (37.50018383334346, 55.89339220941865), -(37.52421672750851, 55.903869074155224), (37.52977457672118, 55.90564076517974), (37.53503220370484, 55.90661661218259), (37.54042858064267, 55.90714113744566), (37.54320461007303, 55.905645048442985), (37.545686966066306, 55.906608607018505), -(37.54743976120755, 55.90788552162358), (37.55796999999999, 55.90901557907218), (37.572711542327866, 55.91059395704873), (37.57942799999998, 55.91073854155573), (37.58502865872187, 55.91009969268444), (37.58739968913264, 55.90794809960554), -(37.59131567193598, 55.908713267595054), (37.612687423278814, 55.902866854295375), (37.62348079629517, 55.90041967242986), (37.635797880950896, 55.898141151686396), (37.649487626983664, 55.89639275532968), (37.65619302513125, 55.89572360207488), -(37.66294133862307, 55.895295577183965), (37.66874564418033, 55.89505457604897), (37.67375601586915, 55.89254677027454), (37.67744661901856, 55.8947775867987), (37.688347, 55.89450045676125), (37.69480554232789, 55.89422926332761), -(37.70107096560668, 55.89322256101114), 
(37.705962965606716, 55.891763491662616), (37.711885134918205, 55.889110234998974), (37.71682005026245, 55.886577568759876), (37.7199315476074, 55.88458159806678), (37.72234560316464, 55.882281005794134), -(37.72364385977171, 55.8809452036196), (37.725371142837474, 55.8809722706006), (37.727870902099546, 55.88037213862385), (37.73394330422971, 55.877941504088696), (37.745339592590376, 55.87208120378722), (37.75525267724611, 55.86703807949492), -(37.76919976190188, 55.859821640197474), (37.827835219574, 55.82962968399116), (37.83341438888553, 55.82575289922351), (37.83652584655761, 55.82188784027888), (37.83809213491821, 55.81612575504693), (37.83605359521481, 55.81460347077685), -(37.83632178569025, 55.81276696067908), (37.838623105812026, 55.811486181656385), (37.83912198147584, 55.807329380532785), (37.839079078033414, 55.80510270463816), (37.83965844708251, 55.79940712529036), (37.840581150787344, 55.79131399999368), -(37.84172564285271, 55.78000432402266)]); -``` + ```sql + INSERT INTO moscow VALUES ([(37.84172564285271, 55.78000432402266), + (37.8381207618713, 55.775874525970494), (37.83979446823122, 55.775626746008065), (37.84243326983639, 55.77446586811748), (37.84262672750849, 55.771974101091104), (37.84153238623039, 55.77114545193181), (37.841124690460184, 55.76722010265554), + (37.84239076983644, 55.76654891107098), (37.842283558197025, 55.76258709833121), (37.8421759312134, 55.758073999993734), (37.84198330422974, 55.75381499999371), (37.8416827275085, 55.749277102484484), (37.84157576190186, 55.74794544108413), + (37.83897929098507, 55.74525257875241), (37.83739676451868, 55.74404373042019), (37.838732481460525, 55.74298009816793), (37.841183997352545, 55.743060321833575), (37.84097476190185, 55.73938799999373), (37.84048155819702, 55.73570799999372), + (37.840095812164286, 55.73228210777237), (37.83983814285274, 55.73080491981639), (37.83846476321406, 55.729799917464675), (37.83835745269769, 55.72919751082619), (37.838636380279524, 55.72859509486539), (37.8395161005249, 55.727705075632784), + (37.83897964285276, 55.722727886185154), (37.83862557539366, 55.72034817326636), (37.83559735744853, 55.71944437307499), (37.835370708803126, 55.71831419154461), (37.83738169402022, 55.71765218986692), (37.83823396494291, 55.71691750159089), + (37.838056931213345, 55.71547311301385), (37.836812846557606, 55.71221445615604), (37.83522525396725, 55.709331054395555), (37.83269301586908, 55.70953687463627), (37.829667367706236, 55.70903403789297), (37.83311126588435, 55.70552351822608), + (37.83058993121339, 55.70041317726053), (37.82983872750851, 55.69883771404813), (37.82934501586913, 55.69718947487017), (37.828926414016685, 55.69504441658371), (37.82876530422971, 55.69287499999378), (37.82894754100031, 55.690759754047335), + (37.827697554878185, 55.68951421135665), (37.82447346292115, 55.68965045405069), (37.83136543914793, 55.68322046195302), (37.833554015869154, 55.67814012759211), (37.83544184655761, 55.67295011628339), (37.837480388885474, 55.6672498719639), + (37.838960677246064, 55.66316274139358), (37.83926093121332, 55.66046999999383), (37.839025050262435, 55.65869897264431), (37.83670784390257, 55.65794084879904), (37.835656529083245, 55.65694309303843), (37.83704060449217, 55.65689306460552), + (37.83696819873806, 55.65550363526252), (37.83760389616388, 55.65487847246661), (37.83687972750851, 55.65356745541324), (37.83515216004943, 55.65155951234079), (37.83312418518067, 55.64979413590619), (37.82801726983639, 55.64640836412121), + (37.820614174591, 55.64164525405531), 
(37.818908190475426, 55.6421883258084), (37.81717543386075, 55.64112490388471), (37.81690987037274, 55.63916106913107), (37.815099354492155, 55.637925371757085), (37.808769150787356, 55.633798276884455), + (37.80100123544311, 55.62873670012244), (37.79598013491824, 55.62554336109055), (37.78634567724606, 55.62033499605651), (37.78334147619623, 55.618768681480326), (37.77746201055901, 55.619855533402706), (37.77527329626457, 55.61909966711279), + (37.77801986242668, 55.618770300976294), (37.778212973541216, 55.617257701952106), (37.77784818518065, 55.61574504433011), (37.77016867724609, 55.61148576294007), (37.760191219573976, 55.60599579539028), (37.75338926983641, 55.60227892751446), + (37.746329965606634, 55.59920577639331), (37.73939925396728, 55.59631430313617), (37.73273665739439, 55.5935318803559), (37.7299954450912, 55.59350760316188), (37.7268679946899, 55.59469840523759), (37.72626726983634, 55.59229549697373), + (37.7262673598022, 55.59081598950582), (37.71897193121335, 55.5877595845419), (37.70871550793456, 55.58393177431724), (37.700497489410374, 55.580917323756644), (37.69204305026244, 55.57778089778455), (37.68544477378839, 55.57815154690915), + (37.68391050793454, 55.57472945079756), (37.678803592590306, 55.57328235936491), (37.6743402539673, 55.57255251445782), (37.66813862698363, 55.57216388774464), (37.617927457672096, 55.57505691895805), (37.60443099999999, 55.5757737568051), + (37.599683515869145, 55.57749105910326), (37.59754177842709, 55.57796291823627), (37.59625834786988, 55.57906686095235), (37.59501783265684, 55.57746616444403), (37.593090671936025, 55.57671634534502), (37.587018007904, 55.577944600233785), + (37.578692203704804, 55.57982895000019), (37.57327546607398, 55.58116294118248), (37.57385012109279, 55.581550362779), (37.57399562266922, 55.5820107079112), (37.5735356072979, 55.58226289171689), (37.57290393054962, 55.582393529795155), + (37.57037722355653, 55.581919415056234), (37.5592298306885, 55.584471614867844), (37.54189249206543, 55.58867650795186), (37.5297256269836, 55.59158133551745), (37.517837865081766, 55.59443656218868), (37.51200186508174, 55.59635625174229), + (37.506808949737554, 55.59907823904434), (37.49820432275389, 55.6062944994944), (37.494406071441674, 55.60967103463367), (37.494760001358024, 55.61066689753365), (37.49397137107085, 55.61220931698269), (37.49016528606031, 55.613417718449064), + (37.48773249206542, 55.61530616333343), (37.47921386508177, 55.622640129112334), (37.470652153442394, 55.62993723476164), (37.46273446298218, 55.6368075123157), (37.46350692265317, 55.64068225239439), (37.46050283203121, 55.640794546982576), + (37.457627470916734, 55.64118904154646), (37.450718034393326, 55.64690488145138), (37.44239252645875, 55.65397824729769), (37.434587576721185, 55.66053543155961), (37.43582144975277, 55.661693766520735), (37.43576786245721, 55.662755031737014), + (37.430982915344174, 55.664610641628116), (37.428547447097685, 55.66778515273695), (37.42945134592044, 55.668633314343566), (37.42859571562949, 55.66948145750025), (37.4262836402282, 55.670813882451405), (37.418709037048295, 55.6811141674414), + (37.41922139651101, 55.68235377885389), (37.419218771842885, 55.68359335082235), (37.417196501327446, 55.684375235224735), (37.41607020370478, 55.68540557585352), (37.415640857147146, 55.68686637150793), (37.414632153442334, 55.68903015131686), + (37.413344899475064, 55.690896881757396), (37.41171432275391, 55.69264232162232), (37.40948282275393, 55.69455101638112), (37.40703674603271, 55.69638690385348), 
(37.39607169577025, 55.70451821283731), (37.38952706878662, 55.70942491932811), + (37.387778313491815, 55.71149057784176), (37.39049275399779, 55.71419814298992), (37.385557272491454, 55.7155489617061), (37.38388335714726, 55.71849856042102), (37.378368238098155, 55.7292763261685), (37.37763597123337, 55.730845879211614), + (37.37890062088197, 55.73167906388319), (37.37750451918789, 55.734703664681774), (37.375610832015965, 55.734851959522246), (37.3723813571472, 55.74105626086403), (37.37014935714723, 55.746115620904355), (37.36944173016362, 55.750883999993725), + (37.36975304365541, 55.76335905525834), (37.37244070571134, 55.76432079697595), (37.3724259757175, 55.76636979670426), (37.369922155757884, 55.76735417953104), (37.369892695770275, 55.76823419316575), (37.370214730163575, 55.782312184391266), + (37.370493611114505, 55.78436801120489), (37.37120164550783, 55.78596427165359), (37.37284851456452, 55.7874378183096), (37.37608325135799, 55.7886695054807), (37.3764587460632, 55.78947647305964), (37.37530000265506, 55.79146512926804), + (37.38235915344241, 55.79899647809345), (37.384344043655396, 55.80113596939471), (37.38594269577028, 55.80322699999366), (37.38711208598329, 55.804919036911976), (37.3880239841309, 55.806610999993666), (37.38928977249147, 55.81001864976979), + (37.39038389947512, 55.81348641242801), (37.39235781481933, 55.81983538336746), (37.393709457672124, 55.82417822811877), (37.394685720901464, 55.82792275755836), (37.39557615344238, 55.830447148154136), (37.39844478226658, 55.83167107969975), + (37.40019761214057, 55.83151823557964), (37.400398790382326, 55.83264967594742), (37.39659544313046, 55.83322180909622), (37.39667059524539, 55.83402792148566), (37.39682089947515, 55.83638877400216), (37.39643489154053, 55.83861656112751), + (37.3955338994751, 55.84072348043264), (37.392680272491454, 55.84502158126453), (37.39241188227847, 55.84659117913199), (37.392529730163616, 55.84816071336481), (37.39486835714723, 55.85288092980303), (37.39873052645878, 55.859893456073635), + (37.40272161111449, 55.86441833633205), (37.40697072750854, 55.867579567544375), (37.410007082016016, 55.868369880337), (37.4120992989502, 55.86920843741314), (37.412668021163924, 55.87055369615854), (37.41482461111453, 55.87170587948249), + (37.41862266137694, 55.873183961039565), (37.42413732540892, 55.874879126654704), (37.4312182698669, 55.875614937236705), (37.43111093783558, 55.8762723478417), (37.43332105622856, 55.87706546369396), (37.43385747619623, 55.87790681284802), + (37.441303050262405, 55.88027084462084), (37.44747234260555, 55.87942070143253), (37.44716141796871, 55.88072960917233), (37.44769797085568, 55.88121221323979), (37.45204320500181, 55.882080694420715), (37.45673176190186, 55.882346110794586), + (37.463383999999984, 55.88252729504517), (37.46682797486874, 55.88294937719063), (37.470014457672086, 55.88361266759345), (37.47751410450743, 55.88546991372396), (37.47860317658232, 55.88534929207307), (37.48165826025772, 55.882563306475106), + (37.48316434442331, 55.8815803226785), (37.483831555817645, 55.882427612793315), (37.483182967125686, 55.88372791409729), (37.483092277908824, 55.88495581062434), (37.4855716508179, 55.8875561994203), (37.486440636245746, 55.887827444039566), + (37.49014203439328, 55.88897899871799), (37.493210285705544, 55.890208937135604), (37.497512451065035, 55.891342397444696), (37.49780744510645, 55.89174030252967), (37.49940333499519, 55.89239745507079), (37.50018383334346, 55.89339220941865), + (37.52421672750851, 55.903869074155224), 
(37.52977457672118, 55.90564076517974), (37.53503220370484, 55.90661661218259), (37.54042858064267, 55.90714113744566), (37.54320461007303, 55.905645048442985), (37.545686966066306, 55.906608607018505), + (37.54743976120755, 55.90788552162358), (37.55796999999999, 55.90901557907218), (37.572711542327866, 55.91059395704873), (37.57942799999998, 55.91073854155573), (37.58502865872187, 55.91009969268444), (37.58739968913264, 55.90794809960554), + (37.59131567193598, 55.908713267595054), (37.612687423278814, 55.902866854295375), (37.62348079629517, 55.90041967242986), (37.635797880950896, 55.898141151686396), (37.649487626983664, 55.89639275532968), (37.65619302513125, 55.89572360207488), + (37.66294133862307, 55.895295577183965), (37.66874564418033, 55.89505457604897), (37.67375601586915, 55.89254677027454), (37.67744661901856, 55.8947775867987), (37.688347, 55.89450045676125), (37.69480554232789, 55.89422926332761), + (37.70107096560668, 55.89322256101114), (37.705962965606716, 55.891763491662616), (37.711885134918205, 55.889110234998974), (37.71682005026245, 55.886577568759876), (37.7199315476074, 55.88458159806678), (37.72234560316464, 55.882281005794134), + (37.72364385977171, 55.8809452036196), (37.725371142837474, 55.8809722706006), (37.727870902099546, 55.88037213862385), (37.73394330422971, 55.877941504088696), (37.745339592590376, 55.87208120378722), (37.75525267724611, 55.86703807949492), + (37.76919976190188, 55.859821640197474), (37.827835219574, 55.82962968399116), (37.83341438888553, 55.82575289922351), (37.83652584655761, 55.82188784027888), (37.83809213491821, 55.81612575504693), (37.83605359521481, 55.81460347077685), + (37.83632178569025, 55.81276696067908), (37.838623105812026, 55.811486181656385), (37.83912198147584, 55.807329380532785), (37.839079078033414, 55.80510270463816), (37.83965844708251, 55.79940712529036), (37.840581150787344, 55.79131399999368), + (37.84172564285271, 55.78000432402266)]); + ``` 3. Check how many cell towers are in Moscow: -```sql -SELECT count() FROM cell_towers -WHERE pointInPolygon((lon, lat), (SELECT * FROM moscow)) -``` -```response -┌─count()─┐ -│ 310463 │ -└─────────┘ + ```sql + SELECT count() FROM cell_towers + WHERE pointInPolygon((lon, lat), (SELECT * FROM moscow)) + ``` + ```response + ┌─count()─┐ + │ 310463 │ + └─────────┘ -1 rows in set. Elapsed: 0.067 sec. Processed 43.28 million rows, 692.42 MB (645.83 million rows/s., 10.33 GB/s.) -``` + 1 rows in set. Elapsed: 0.067 sec. Processed 43.28 million rows, 692.42 MB (645.83 million rows/s., 10.33 GB/s.) + ``` ## Review of the schema {#review-of-the-schema} @@ -282,7 +265,7 @@ The schema for this table was designed for compact storage on disk and query spe - `mcc` or Mobile country code, is stored as a `UInt16` as we know the range is 1 - 999. - `lon` and `lat` are `Float64`. -None of the other fields are used in the queries or visualizations in this guide, but they are described in the forum linked above if you are interested. + None of the other fields are used in the queries or visualizations in this guide, but they are described in the forum linked above if you are interested. ## Build visualizations with Apache Superset {#build-visualizations-with-apache-superset} @@ -303,7 +286,7 @@ To build a Superset dashboard using the OpenCelliD dataset you should: In Superset a database can be added by choosing the database type, and then providing the connection details. Open Superset and look for the **+**, it has a menu with **Data** and then **Connect database** options. 
Add a database - + Choose **ClickHouse Connect** from the list: Choose clickhouse connect as database type diff --git a/docs/getting-started/example-datasets/covid19.md b/docs/getting-started/example-datasets/covid19.md index 64e534eff11..d8eb7ba28c1 100644 --- a/docs/getting-started/example-datasets/covid19.md +++ b/docs/getting-started/example-datasets/covid19.md @@ -18,58 +18,58 @@ The following commands were executed on a **Production** instance of [ClickHouse 1. Let's see what the data looks like: -```sql -DESCRIBE url( + ```sql + DESCRIBE url( 'https://storage.googleapis.com/covid19-open-data/v3/epidemiology.csv', 'CSVWithNames' -); -``` - -The CSV file has 10 columns: - -```response -┌─name─────────────────┬─type─────────────┐ -│ date │ Nullable(Date) │ -│ location_key │ Nullable(String) │ -│ new_confirmed │ Nullable(Int64) │ -│ new_deceased │ Nullable(Int64) │ -│ new_recovered │ Nullable(Int64) │ -│ new_tested │ Nullable(Int64) │ -│ cumulative_confirmed │ Nullable(Int64) │ -│ cumulative_deceased │ Nullable(Int64) │ -│ cumulative_recovered │ Nullable(Int64) │ -│ cumulative_tested │ Nullable(Int64) │ -└──────────────────────┴──────────────────┘ - -10 rows in set. Elapsed: 0.745 sec. -``` + ); + ``` + + The CSV file has 10 columns: + + ```response + ┌─name─────────────────┬─type─────────────┐ + │ date │ Nullable(Date) │ + │ location_key │ Nullable(String) │ + │ new_confirmed │ Nullable(Int64) │ + │ new_deceased │ Nullable(Int64) │ + │ new_recovered │ Nullable(Int64) │ + │ new_tested │ Nullable(Int64) │ + │ cumulative_confirmed │ Nullable(Int64) │ + │ cumulative_deceased │ Nullable(Int64) │ + │ cumulative_recovered │ Nullable(Int64) │ + │ cumulative_tested │ Nullable(Int64) │ + └──────────────────────┴──────────────────┘ + + 10 rows in set. Elapsed: 0.745 sec. + ``` 2. 
Now let's view some of the rows: -```sql -SELECT * -FROM url('https://storage.googleapis.com/covid19-open-data/v3/epidemiology.csv') -LIMIT 100; -``` - -Notice the `url` function easily reads data from a CSV file: - -```response -┌─c1─────────┬─c2───────────┬─c3────────────┬─c4───────────┬─c5────────────┬─c6─────────┬─c7───────────────────┬─c8──────────────────┬─c9───────────────────┬─c10───────────────┐ -│ date │ location_key │ new_confirmed │ new_deceased │ new_recovered │ new_tested │ cumulative_confirmed │ cumulative_deceased │ cumulative_recovered │ cumulative_tested │ -│ 2020-04-03 │ AD │ 24 │ 1 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 466 │ 17 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -│ 2020-04-04 │ AD │ 57 │ 0 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 523 │ 17 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -│ 2020-04-05 │ AD │ 17 │ 4 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 540 │ 21 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -│ 2020-04-06 │ AD │ 11 │ 1 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 551 │ 22 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -│ 2020-04-07 │ AD │ 15 │ 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 566 │ 24 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -│ 2020-04-08 │ AD │ 23 │ 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 589 │ 26 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -└────────────┴──────────────┴───────────────┴──────────────┴───────────────┴────────────┴──────────────────────┴─────────────────────┴──────────────────────┴───────────────────┘ -``` + ```sql + SELECT * + FROM url('https://storage.googleapis.com/covid19-open-data/v3/epidemiology.csv') + LIMIT 100; + ``` + + Notice the `url` function easily reads data from a CSV file: + + ```response + ┌─c1─────────┬─c2───────────┬─c3────────────┬─c4───────────┬─c5────────────┬─c6─────────┬─c7───────────────────┬─c8──────────────────┬─c9───────────────────┬─c10───────────────┐ + │ date │ location_key │ new_confirmed │ new_deceased │ new_recovered │ new_tested │ cumulative_confirmed │ cumulative_deceased │ cumulative_recovered │ cumulative_tested │ + │ 2020-04-03 │ AD │ 24 │ 1 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 466 │ 17 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ + │ 2020-04-04 │ AD │ 57 │ 0 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 523 │ 17 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ + │ 2020-04-05 │ AD │ 17 │ 4 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 540 │ 21 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ + │ 2020-04-06 │ AD │ 11 │ 1 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 551 │ 22 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ + │ 2020-04-07 │ AD │ 15 │ 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 566 │ 24 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ + │ 2020-04-08 │ AD │ 23 │ 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 589 │ 26 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ + └────────────┴──────────────┴───────────────┴──────────────┴───────────────┴────────────┴──────────────────────┴─────────────────────┴──────────────────────┴───────────────────┘ + ``` 3. We will create a table now that we know what the data looks like: -```sql -CREATE TABLE covid19 ( + ```sql + CREATE TABLE covid19 ( date Date, location_key LowCardinality(String), new_confirmed Int32, @@ -80,17 +80,17 @@ CREATE TABLE covid19 ( cumulative_deceased Int32, cumulative_recovered Int32, cumulative_tested Int32 -) -ENGINE = MergeTree -ORDER BY (location_key, date); -``` + ) + ENGINE = MergeTree + ORDER BY (location_key, date); + ``` 4. The following command inserts the entire dataset into the `covid19` table: -```sql -INSERT INTO covid19 - SELECT * - FROM + ```sql + INSERT INTO covid19 + SELECT * + FROM url( 'https://storage.googleapis.com/covid19-open-data/v3/epidemiology.csv', CSVWithNames, @@ -105,162 +105,162 @@ INSERT INTO covid19 cumulative_recovered Int32, cumulative_tested Int32' ); -``` + ``` 5. 
It goes pretty quick - let's see how many rows were inserted: -```sql -SELECT formatReadableQuantity(count()) -FROM covid19; -``` + ```sql + SELECT formatReadableQuantity(count()) + FROM covid19; + ``` -```response -┌─formatReadableQuantity(count())─┐ -│ 12.53 million │ -└─────────────────────────────────┘ -``` + ```response + ┌─formatReadableQuantity(count())─┐ + │ 12.53 million │ + └─────────────────────────────────┘ + ``` 6. Let's see how many total cases of Covid-19 were recorded: -```sql -SELECT formatReadableQuantity(sum(new_confirmed)) -FROM covid19; -``` + ```sql + SELECT formatReadableQuantity(sum(new_confirmed)) + FROM covid19; + ``` -```response -┌─formatReadableQuantity(sum(new_confirmed))─┐ -│ 1.39 billion │ -└────────────────────────────────────────────┘ -``` + ```response + ┌─formatReadableQuantity(sum(new_confirmed))─┐ + │ 1.39 billion │ + └────────────────────────────────────────────┘ + ``` 7. You will notice the data has a lot of 0's for dates - either weekends or days when numbers were not reported each day. We can use a window function to smooth out the daily averages of new cases: -```sql -SELECT - AVG(new_confirmed) OVER (PARTITION BY location_key ORDER BY date ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING) AS cases_smoothed, - new_confirmed, - location_key, - date -FROM covid19; -``` + ```sql + SELECT + AVG(new_confirmed) OVER (PARTITION BY location_key ORDER BY date ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING) AS cases_smoothed, + new_confirmed, + location_key, + date + FROM covid19; + ``` 8. This query determines the latest values for each location. We can't use `max(date)` because not all countries reported every day, so we grab the last row using `ROW_NUMBER`: -```sql -WITH latest_deaths_data AS - ( SELECT location_key, + ```sql + WITH latest_deaths_data AS + ( SELECT location_key, date, new_deceased, new_confirmed, ROW_NUMBER() OVER (PARTITION BY location_key ORDER BY date DESC) AS rn FROM covid19) -SELECT location_key, + SELECT location_key, date, new_deceased, new_confirmed, rn -FROM latest_deaths_data -WHERE rn=1; -``` + FROM latest_deaths_data + WHERE rn=1; + ``` 9. We can use `lagInFrame` to determine the `LAG` of new cases each day. 
In this query we filter by the `US_DC` location: -```sql -SELECT - new_confirmed - lagInFrame(new_confirmed,1) OVER (PARTITION BY location_key ORDER BY date) AS confirmed_cases_delta, - new_confirmed, - location_key, - date -FROM covid19 -WHERE location_key = 'US_DC'; -``` - -The response look like: - -```response -┌─confirmed_cases_delta─┬─new_confirmed─┬─location_key─┬───────date─┐ -│ 0 │ 0 │ US_DC │ 2020-03-08 │ -│ 2 │ 2 │ US_DC │ 2020-03-09 │ -│ -2 │ 0 │ US_DC │ 2020-03-10 │ -│ 6 │ 6 │ US_DC │ 2020-03-11 │ -│ -6 │ 0 │ US_DC │ 2020-03-12 │ -│ 0 │ 0 │ US_DC │ 2020-03-13 │ -│ 6 │ 6 │ US_DC │ 2020-03-14 │ -│ -5 │ 1 │ US_DC │ 2020-03-15 │ -│ 4 │ 5 │ US_DC │ 2020-03-16 │ -│ 4 │ 9 │ US_DC │ 2020-03-17 │ -│ -1 │ 8 │ US_DC │ 2020-03-18 │ -│ 24 │ 32 │ US_DC │ 2020-03-19 │ -│ -26 │ 6 │ US_DC │ 2020-03-20 │ -│ 15 │ 21 │ US_DC │ 2020-03-21 │ -│ -3 │ 18 │ US_DC │ 2020-03-22 │ -│ 3 │ 21 │ US_DC │ 2020-03-23 │ -``` + ```sql + SELECT + new_confirmed - lagInFrame(new_confirmed,1) OVER (PARTITION BY location_key ORDER BY date) AS confirmed_cases_delta, + new_confirmed, + location_key, + date + FROM covid19 + WHERE location_key = 'US_DC'; + ``` + + The response look like: + + ```response + ┌─confirmed_cases_delta─┬─new_confirmed─┬─location_key─┬───────date─┐ + │ 0 │ 0 │ US_DC │ 2020-03-08 │ + │ 2 │ 2 │ US_DC │ 2020-03-09 │ + │ -2 │ 0 │ US_DC │ 2020-03-10 │ + │ 6 │ 6 │ US_DC │ 2020-03-11 │ + │ -6 │ 0 │ US_DC │ 2020-03-12 │ + │ 0 │ 0 │ US_DC │ 2020-03-13 │ + │ 6 │ 6 │ US_DC │ 2020-03-14 │ + │ -5 │ 1 │ US_DC │ 2020-03-15 │ + │ 4 │ 5 │ US_DC │ 2020-03-16 │ + │ 4 │ 9 │ US_DC │ 2020-03-17 │ + │ -1 │ 8 │ US_DC │ 2020-03-18 │ + │ 24 │ 32 │ US_DC │ 2020-03-19 │ + │ -26 │ 6 │ US_DC │ 2020-03-20 │ + │ 15 │ 21 │ US_DC │ 2020-03-21 │ + │ -3 │ 18 │ US_DC │ 2020-03-22 │ + │ 3 │ 21 │ US_DC │ 2020-03-23 │ + ``` 10. 
This query calculates the percentage of change in new cases each day, and includes a simple `increase` or `decrease` column in the result set: -```sql -WITH confirmed_lag AS ( - SELECT + ```sql + WITH confirmed_lag AS ( + SELECT *, lagInFrame(new_confirmed) OVER( PARTITION BY location_key ORDER BY date ) AS confirmed_previous_day - FROM covid19 -), -confirmed_percent_change AS ( - SELECT + FROM covid19 + ), + confirmed_percent_change AS ( + SELECT *, COALESCE(ROUND((new_confirmed - confirmed_previous_day) / confirmed_previous_day * 100), 0) AS percent_change - FROM confirmed_lag -) -SELECT - date, - new_confirmed, - percent_change, - CASE + FROM confirmed_lag + ) + SELECT + date, + new_confirmed, + percent_change, + CASE WHEN percent_change > 0 THEN 'increase' WHEN percent_change = 0 THEN 'no change' ELSE 'decrease' - END AS trend -FROM confirmed_percent_change -WHERE location_key = 'US_DC'; -``` - -The results look like - -```response -┌───────date─┬─new_confirmed─┬─percent_change─┬─trend─────┐ -│ 2020-03-08 │ 0 │ nan │ decrease │ -│ 2020-03-09 │ 2 │ inf │ increase │ -│ 2020-03-10 │ 0 │ -100 │ decrease │ -│ 2020-03-11 │ 6 │ inf │ increase │ -│ 2020-03-12 │ 0 │ -100 │ decrease │ -│ 2020-03-13 │ 0 │ nan │ decrease │ -│ 2020-03-14 │ 6 │ inf │ increase │ -│ 2020-03-15 │ 1 │ -83 │ decrease │ -│ 2020-03-16 │ 5 │ 400 │ increase │ -│ 2020-03-17 │ 9 │ 80 │ increase │ -│ 2020-03-18 │ 8 │ -11 │ decrease │ -│ 2020-03-19 │ 32 │ 300 │ increase │ -│ 2020-03-20 │ 6 │ -81 │ decrease │ -│ 2020-03-21 │ 21 │ 250 │ increase │ -│ 2020-03-22 │ 18 │ -14 │ decrease │ -│ 2020-03-23 │ 21 │ 17 │ increase │ -│ 2020-03-24 │ 46 │ 119 │ increase │ -│ 2020-03-25 │ 48 │ 4 │ increase │ -│ 2020-03-26 │ 36 │ -25 │ decrease │ -│ 2020-03-27 │ 37 │ 3 │ increase │ -│ 2020-03-28 │ 38 │ 3 │ increase │ -│ 2020-03-29 │ 59 │ 55 │ increase │ -│ 2020-03-30 │ 94 │ 59 │ increase │ -│ 2020-03-31 │ 91 │ -3 │ decrease │ -│ 2020-04-01 │ 67 │ -26 │ decrease │ -│ 2020-04-02 │ 104 │ 55 │ increase │ -│ 2020-04-03 │ 145 │ 39 │ increase │ -``` - -:::note -As mentioned in the [GitHub repo](https://github.com/GoogleCloudPlatform/covid-19-open-data), the dataset is no longer updated as of September 15, 2022. 
-::: + END AS trend + FROM confirmed_percent_change + WHERE location_key = 'US_DC'; + ``` + + The results look like + + ```response + ┌───────date─┬─new_confirmed─┬─percent_change─┬─trend─────┐ + │ 2020-03-08 │ 0 │ nan │ decrease │ + │ 2020-03-09 │ 2 │ inf │ increase │ + │ 2020-03-10 │ 0 │ -100 │ decrease │ + │ 2020-03-11 │ 6 │ inf │ increase │ + │ 2020-03-12 │ 0 │ -100 │ decrease │ + │ 2020-03-13 │ 0 │ nan │ decrease │ + │ 2020-03-14 │ 6 │ inf │ increase │ + │ 2020-03-15 │ 1 │ -83 │ decrease │ + │ 2020-03-16 │ 5 │ 400 │ increase │ + │ 2020-03-17 │ 9 │ 80 │ increase │ + │ 2020-03-18 │ 8 │ -11 │ decrease │ + │ 2020-03-19 │ 32 │ 300 │ increase │ + │ 2020-03-20 │ 6 │ -81 │ decrease │ + │ 2020-03-21 │ 21 │ 250 │ increase │ + │ 2020-03-22 │ 18 │ -14 │ decrease │ + │ 2020-03-23 │ 21 │ 17 │ increase │ + │ 2020-03-24 │ 46 │ 119 │ increase │ + │ 2020-03-25 │ 48 │ 4 │ increase │ + │ 2020-03-26 │ 36 │ -25 │ decrease │ + │ 2020-03-27 │ 37 │ 3 │ increase │ + │ 2020-03-28 │ 38 │ 3 │ increase │ + │ 2020-03-29 │ 59 │ 55 │ increase │ + │ 2020-03-30 │ 94 │ 59 │ increase │ + │ 2020-03-31 │ 91 │ -3 │ decrease │ + │ 2020-04-01 │ 67 │ -26 │ decrease │ + │ 2020-04-02 │ 104 │ 55 │ increase │ + │ 2020-04-03 │ 145 │ 39 │ increase │ + ``` + + :::note + As mentioned in the [GitHub repo](https://github.com/GoogleCloudPlatform/covid-19-open-data), the dataset is no longer updated as of September 15, 2022. + ::: diff --git a/docs/getting-started/example-datasets/environmental-sensors.md b/docs/getting-started/example-datasets/environmental-sensors.md index 136cf65bcfd..c68ed87df69 100644 --- a/docs/getting-started/example-datasets/environmental-sensors.md +++ b/docs/getting-started/example-datasets/environmental-sensors.md @@ -18,38 +18,38 @@ The dataset has over 20 billion records, so be careful just copying-and-pasting 1. The data is in S3, so we can use the `s3` table function to create a table from the files. We can also query the data in place. Let's look at a few rows before attempting to insert it into ClickHouse: -```sql -SELECT * -FROM s3( + ```sql + SELECT * + FROM s3( 'https://clickhouse-public-datasets.s3.eu-central-1.amazonaws.com/sensors/monthly/2019-06_bmp180.csv.zst', 'CSVWithNames' - ) -LIMIT 10 -SETTINGS format_csv_delimiter = ';'; -``` - -The data is in CSV files but uses a semi-colon for the delimiter. 
The rows look like: - -```response -┌─sensor_id─┬─sensor_type─┬─location─┬────lat─┬────lon─┬─timestamp───────────┬──pressure─┬─altitude─┬─pressure_sealevel─┬─temperature─┐ -│ 9119 │ BMP180 │ 4594 │ 50.994 │ 7.126 │ 2019-06-01T00:00:00 │ 101471 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 19.9 │ -│ 21210 │ BMP180 │ 10762 │ 42.206 │ 25.326 │ 2019-06-01T00:00:00 │ 99525 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 19.3 │ -│ 19660 │ BMP180 │ 9978 │ 52.434 │ 17.056 │ 2019-06-01T00:00:04 │ 101570 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 15.3 │ -│ 12126 │ BMP180 │ 6126 │ 57.908 │ 16.49 │ 2019-06-01T00:00:05 │ 101802.56 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 8.07 │ -│ 15845 │ BMP180 │ 8022 │ 52.498 │ 13.466 │ 2019-06-01T00:00:05 │ 101878 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 23 │ -│ 16415 │ BMP180 │ 8316 │ 49.312 │ 6.744 │ 2019-06-01T00:00:06 │ 100176 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 14.7 │ -│ 7389 │ BMP180 │ 3735 │ 50.136 │ 11.062 │ 2019-06-01T00:00:06 │ 98905 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 12.1 │ -│ 13199 │ BMP180 │ 6664 │ 52.514 │ 13.44 │ 2019-06-01T00:00:07 │ 101855.54 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 19.74 │ -│ 12753 │ BMP180 │ 6440 │ 44.616 │ 2.032 │ 2019-06-01T00:00:07 │ 99475 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 17 │ -│ 16956 │ BMP180 │ 8594 │ 52.052 │ 8.354 │ 2019-06-01T00:00:08 │ 101322 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 17.2 │ -└───────────┴─────────────┴──────────┴────────┴────────┴─────────────────────┴───────────┴──────────┴───────────────────┴─────────────┘ -``` + ) + LIMIT 10 + SETTINGS format_csv_delimiter = ';'; + ``` + + The data is in CSV files but uses a semi-colon for the delimiter. The rows look like: + + ```response + ┌─sensor_id─┬─sensor_type─┬─location─┬────lat─┬────lon─┬─timestamp───────────┬──pressure─┬─altitude─┬─pressure_sealevel─┬─temperature─┐ + │ 9119 │ BMP180 │ 4594 │ 50.994 │ 7.126 │ 2019-06-01T00:00:00 │ 101471 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 19.9 │ + │ 21210 │ BMP180 │ 10762 │ 42.206 │ 25.326 │ 2019-06-01T00:00:00 │ 99525 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 19.3 │ + │ 19660 │ BMP180 │ 9978 │ 52.434 │ 17.056 │ 2019-06-01T00:00:04 │ 101570 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 15.3 │ + │ 12126 │ BMP180 │ 6126 │ 57.908 │ 16.49 │ 2019-06-01T00:00:05 │ 101802.56 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 8.07 │ + │ 15845 │ BMP180 │ 8022 │ 52.498 │ 13.466 │ 2019-06-01T00:00:05 │ 101878 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 23 │ + │ 16415 │ BMP180 │ 8316 │ 49.312 │ 6.744 │ 2019-06-01T00:00:06 │ 100176 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 14.7 │ + │ 7389 │ BMP180 │ 3735 │ 50.136 │ 11.062 │ 2019-06-01T00:00:06 │ 98905 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 12.1 │ + │ 13199 │ BMP180 │ 6664 │ 52.514 │ 13.44 │ 2019-06-01T00:00:07 │ 101855.54 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 19.74 │ + │ 12753 │ BMP180 │ 6440 │ 44.616 │ 2.032 │ 2019-06-01T00:00:07 │ 99475 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 17 │ + │ 16956 │ BMP180 │ 8594 │ 52.052 │ 8.354 │ 2019-06-01T00:00:08 │ 101322 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 17.2 │ + └───────────┴─────────────┴──────────┴────────┴────────┴─────────────────────┴───────────┴──────────┴───────────────────┴─────────────┘ + ``` 2. We will use the following `MergeTree` table to store the data in ClickHouse: -```sql -CREATE TABLE sensors -( + ```sql + CREATE TABLE sensors + ( sensor_id UInt16, sensor_type Enum('BME280', 'BMP180', 'BMP280', 'DHT22', 'DS18B20', 'HPM', 'HTU21D', 'PMS1003', 'PMS3003', 'PMS5003', 'PMS6003', 'PMS7003', 'PPD42NS', 'SDS011'), location UInt32, @@ -69,17 +69,17 @@ CREATE TABLE sensors temperature Float32, humidity Float32, date Date MATERIALIZED toDate(timestamp) -) -ENGINE = MergeTree -ORDER BY (timestamp, sensor_id); -``` + ) + ENGINE = MergeTree + ORDER BY (timestamp, sensor_id); + ``` 3. ClickHouse Cloud services have a cluster named `default`. We will use the `s3Cluster` table function, which reads S3 files in parallel from the nodes in your cluster. (If you do not have a cluster, just use the `s3` function and remove the cluster name.) 
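To make the difference concrete, here is a minimal, read-only sketch of an `s3Cluster` call (it assumes a cluster named `default` and reuses the monthly file URL from step 1); compared with the plain `s3` function, the only change is the cluster name passed as the first argument:

```sql
-- Count the rows of a single monthly file, with the read fanned out across the nodes of the `default` cluster
SELECT count()
FROM s3Cluster(
    'default',
    'https://clickhouse-public-datasets.s3.eu-central-1.amazonaws.com/sensors/monthly/2019-06_bmp180.csv.zst',
    'CSVWithNames'
)
SETTINGS format_csv_delimiter = ';';
```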
-This query will take a while - it's about 1.67T of data uncompressed: + This query will take a while - it's about 1.67T of data uncompressed: -```sql -INSERT INTO sensors + ```sql + INSERT INTO sensors SELECT * FROM s3Cluster( 'default', @@ -104,7 +104,7 @@ INSERT INTO sensors temperature Float32, humidity Float32 $$ ) -SETTINGS + SETTINGS format_csv_delimiter = ';', input_format_allow_errors_ratio = '0.5', input_format_allow_errors_num = 10000, @@ -112,65 +112,65 @@ SETTINGS date_time_input_format = 'best_effort', max_insert_threads = 32, parallel_distributed_insert_select = 1; -``` + ``` -Here is the response - showing the number of rows and the speed of processing. It is input at a rate of over 6M rows per second! + Here is the response - showing the number of rows and the speed of processing. It is input at a rate of over 6M rows per second! -```response -0 rows in set. Elapsed: 3419.330 sec. Processed 20.69 billion rows, 1.67 TB (6.05 million rows/s., 488.52 MB/s.) -``` + ```response + 0 rows in set. Elapsed: 3419.330 sec. Processed 20.69 billion rows, 1.67 TB (6.05 million rows/s., 488.52 MB/s.) + ``` 4. Let's see how much storage disk is needed for the `sensors` table: -```sql -SELECT + ```sql + SELECT disk_name, formatReadableSize(sum(data_compressed_bytes) AS size) AS compressed, formatReadableSize(sum(data_uncompressed_bytes) AS usize) AS uncompressed, round(usize / size, 2) AS compr_rate, sum(rows) AS rows, count() AS part_count -FROM system.parts -WHERE (active = 1) AND (table = 'sensors') -GROUP BY + FROM system.parts + WHERE (active = 1) AND (table = 'sensors') + GROUP BY disk_name -ORDER BY size DESC; -``` + ORDER BY size DESC; + ``` -The 1.67T is compressed down to 310 GiB, and there are 20.69 billion rows: + The 1.67T is compressed down to 310 GiB, and there are 20.69 billion rows: -```response -┌─disk_name─┬─compressed─┬─uncompressed─┬─compr_rate─┬────────rows─┬─part_count─┐ -│ s3disk │ 310.21 GiB │ 1.30 TiB │ 4.29 │ 20693971809 │ 472 │ -└───────────┴────────────┴──────────────┴────────────┴─────────────┴────────────┘ -``` + ```response + ┌─disk_name─┬─compressed─┬─uncompressed─┬─compr_rate─┬────────rows─┬─part_count─┐ + │ s3disk │ 310.21 GiB │ 1.30 TiB │ 4.29 │ 20693971809 │ 472 │ + └───────────┴────────────┴──────────────┴────────────┴─────────────┴────────────┘ + ``` 5. Let's analyze the data now that it's in ClickHouse. Notice the quantity of data increases over time as more sensors are deployed: -```sql -SELECT + ```sql + SELECT date, count() -FROM sensors -GROUP BY date -ORDER BY date ASC; -``` + FROM sensors + GROUP BY date + ORDER BY date ASC; + ``` -We can create a chart in the SQL Console to visualize the results: + We can create a chart in the SQL Console to visualize the results: -Number of events per day + Number of events per day 6. 
This query counts the number of overly hot and humid days: -```sql -WITH + ```sql + WITH toYYYYMMDD(timestamp) AS day -SELECT day, count() FROM sensors -WHERE temperature >= 40 AND temperature <= 50 AND humidity >= 90 -GROUP BY day -ORDER BY day ASC; -``` + SELECT day, count() FROM sensors + WHERE temperature >= 40 AND temperature <= 50 AND humidity >= 90 + GROUP BY day + ORDER BY day ASC; + ``` -Here's a visualization of the result: + Here's a visualization of the result: -Hot and humid days + Hot and humid days diff --git a/docs/getting-started/example-datasets/foursquare-os-places.md b/docs/getting-started/example-datasets/foursquare-os-places.md index 7bd234ebcf2..fb1c785aeec 100644 --- a/docs/getting-started/example-datasets/foursquare-os-places.md +++ b/docs/getting-started/example-datasets/foursquare-os-places.md @@ -1,5 +1,5 @@ --- -description: 'Dataset with over 100 million records containing information about places on a map, such as shops, +description: 'Dataset with over 100 million records containing information about places on a map, such as shops, restaurants, parks, playgrounds, and monuments.' sidebar_label: 'Foursquare places' slug: /getting-started/example-datasets/foursquare-places @@ -18,15 +18,15 @@ import visualization_4 from '@site/static/images/getting-started/example-dataset This dataset by Foursquare is available to [download](https://docs.foursquare.com/data-products/docs/access-fsq-os-places) and to use for free under the Apache 2.0 license. -It contains over 100 million records of commercial points-of-interest (POI), +It contains over 100 million records of commercial points-of-interest (POI), such as shops, restaurants, parks, playgrounds, and monuments. It also includes additional metadata about those places, such as categories and social media information. ## Data exploration {#data-exploration} -For exploring the data we'll use [`clickhouse-local`](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local), a small command-line tool -that provides the full ClickHouse engine, although you could also use +For exploring the data we'll use [`clickhouse-local`](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local), a small command-line tool +that provides the full ClickHouse engine, although you could also use ClickHouse Cloud, `clickhouse-client` or even `chDB`. Run the following query to select the data from the s3 bucket where the data is stored: @@ -145,14 +145,14 @@ DESCRIBE s3('s3://fsq-os-places-us-east-1/release/dt=2025-04-08/places/parquet/* │ │↳ xmax Nullable(Float64),↴│ │ │↳ ymax Nullable(Float64)) │ └─────────────────────┴─────────────────────────────┘ -``` + ``` ## Loading the data into ClickHouse {#loading-the-data} -If you'd like to persist the data on disk, you can use `clickhouse-server` -or ClickHouse Cloud. +If you'd like to persist the data on disk, you can use `clickhouse-server` +or ClickHouse Cloud. -To create the table, run the following command: +To create the table, run the following command: ```sql title="Query" CREATE TABLE foursquare_mercator @@ -197,7 +197,7 @@ CREATE TABLE foursquare_mercator ORDER BY mortonEncode(mercator_x, mercator_y) ``` -Take note of the use of the [`LowCardinality`](/sql-reference/data-types/lowcardinality) +Take note of the use of the [`LowCardinality`](/sql-reference/data-types/lowcardinality) data type for several columns which changes the internal representation of the data types to be dictionary-encoded. 
Operating with dictionary encoded data significantly increases the performance of `SELECT` queries for many applications. @@ -221,49 +221,49 @@ This column converts a longitude value into an X coordinate in the Mercator proj - Dividing by 360 normalizes this to a value between 0 and 1 - Multiplying by `0xFFFFFFFF` (hex for maximum 32-bit unsigned integer) scales this normalized value to the full range of a 32-bit integer -**mercator_y** + **mercator_y** -This column converts a latitude value into a Y coordinate in the Mercator projection: + This column converts a latitude value into a Y coordinate in the Mercator projection: - `latitude + 90` shifts latitude from [-90, 90] to [0, 180] - Dividing by 360 and multiplying by pi() converts to radians for the trigonometric functions - The `log(tan(...))` part is the core of the Mercator projection formula - multiplying by `0xFFFFFFFF` scales to the full 32-bit integer range -Specifying `MATERIALIZED` makes sure that ClickHouse calculates the values for these -columns when we `INSERT` the data, without having to specify these columns (which are not -part of the original data schema) in the `INSERT statement. + Specifying `MATERIALIZED` makes sure that ClickHouse calculates the values for these + columns when we `INSERT` the data, without having to specify these columns (which are not + part of the original data schema) in the `INSERT statement. -The table is ordered by `mortonEncode(mercator_x, mercator_y)` which produces a -Z-order space-filling curve of `mercator_x`, `mercator_y` in order to significantly -improve geospatial query performance. This Z-order curve ordering ensures data is -physically organized by spatial proximity: + The table is ordered by `mortonEncode(mercator_x, mercator_y)` which produces a + Z-order space-filling curve of `mercator_x`, `mercator_y` in order to significantly + improve geospatial query performance. This Z-order curve ordering ensures data is + physically organized by spatial proximity: -```sql -ORDER BY mortonEncode(mercator_x, mercator_y) -``` + ```sql + ORDER BY mortonEncode(mercator_x, mercator_y) + ``` -Two `minmax` indices are also created for faster search: + Two `minmax` indices are also created for faster search: -```sql -INDEX idx_x mercator_x TYPE minmax, -INDEX idx_y mercator_y TYPE minmax -``` + ```sql + INDEX idx_x mercator_x TYPE minmax, + INDEX idx_y mercator_y TYPE minmax + ``` -As you can see, ClickHouse has absolutely everything you need for real-time -mapping applications! + As you can see, ClickHouse has absolutely everything you need for real-time + mapping applications! -Run the following query to load the data: + Run the following query to load the data: -```sql -INSERT INTO foursquare_mercator -SELECT * FROM s3('s3://fsq-os-places-us-east-1/release/dt=2025-04-08/places/parquet/*') -``` + ```sql + INSERT INTO foursquare_mercator + SELECT * FROM s3('s3://fsq-os-places-us-east-1/release/dt=2025-04-08/places/parquet/*') + ``` ## Visualizing the data {#data-visualization} To see what's possible with this dataset, check out [adsb.exposed](https://adsb.exposed/?dataset=Places&zoom=5&lat=52.3488&lng=4.9219). -adsb.exposed was originally built by co-founder and CTO Alexey Milovidov to visualize ADS-B (Automatic Dependent Surveillance-Broadcast) +adsb.exposed was originally built by co-founder and CTO Alexey Milovidov to visualize ADS-B (Automatic Dependent Surveillance-Broadcast) flight data, which is 1000x times larger. During a company hackathon Alexey added the Foursquare data to the tool. 
Some of our favourite visualizations are produced here below for you to enjoy. @@ -275,4 +275,3 @@ Some of our favourite visualizations are produced here below for you to enjoy. ATMs Map of Europe with points of interest categorised by country - diff --git a/docs/getting-started/example-datasets/github.md b/docs/getting-started/example-datasets/github.md index e598f655766..3b907321af5 100644 --- a/docs/getting-started/example-datasets/github.md +++ b/docs/getting-started/example-datasets/github.md @@ -23,7 +23,7 @@ The generated data provides a `tsv` file for each of the following tables: - `file_changes` - files changed in every commit with the info about the change and statistics. - `line_changes` - every changed line in every changed file in every commit with full info about the line and the information about the previous change of this line. -As of November 8th, 2022, each TSV is approximately the following size and number of rows: + As of November 8th, 2022, each TSV is approximately the following size and number of rows: - `commits` - 7.8M - 266,051 rows - `file_changes` - 53M - 266,051 rows @@ -87,14 +87,14 @@ Generated files for the following repositories can be found below: - https://datasets-documentation.s3.amazonaws.com/github/commits/linux/file_changes.tsv.xz - 467MB - https://datasets-documentation.s3.amazonaws.com/github/commits/linux/line_changes.tsv.xz - 1.1G -To insert this data, prepare the database by executing the following queries: + To insert this data, prepare the database by executing the following queries: -```sql -DROP DATABASE IF EXISTS git; -CREATE DATABASE git; + ```sql + DROP DATABASE IF EXISTS git; + CREATE DATABASE git; -CREATE TABLE git.commits -( + CREATE TABLE git.commits + ( hash String, author LowCardinality(String), time DateTime, @@ -108,10 +108,10 @@ CREATE TABLE git.commits hunks_added UInt32, hunks_removed UInt32, hunks_changed UInt32 -) ENGINE = MergeTree ORDER BY time; + ) ENGINE = MergeTree ORDER BY time; -CREATE TABLE git.file_changes -( + CREATE TABLE git.file_changes + ( change_type Enum('Add' = 1, 'Delete' = 2, 'Modify' = 3, 'Rename' = 4, 'Copy' = 5, 'Type' = 6), path LowCardinality(String), old_path LowCardinality(String), @@ -135,10 +135,10 @@ CREATE TABLE git.file_changes commit_hunks_added UInt32, commit_hunks_removed UInt32, commit_hunks_changed UInt32 -) ENGINE = MergeTree ORDER BY time; + ) ENGINE = MergeTree ORDER BY time; -CREATE TABLE git.line_changes -( + CREATE TABLE git.line_changes + ( sign Int8, line_number_old UInt32, line_number_new UInt32, @@ -179,38 +179,37 @@ CREATE TABLE git.line_changes commit_hunks_added UInt32, commit_hunks_removed UInt32, commit_hunks_changed UInt32 -) ENGINE = MergeTree ORDER BY time; -``` - -Insert the data using `INSERT INTO SELECT` and the [s3 function](/sql-reference/table-functions/s3). For example, below, we insert the ClickHouse files into each of their respective tables: + ) ENGINE = MergeTree ORDER BY time; + ``` + Insert the data using `INSERT INTO SELECT` and the [s3 function](/sql-reference/table-functions/s3). 
For example, below, we insert the ClickHouse files into each of their respective tables: -*commits* + *commits* -```sql -INSERT INTO git.commits SELECT * -FROM s3('https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/commits.tsv.xz', 'TSV', 'hash String,author LowCardinality(String), time DateTime, message String, files_added UInt32, files_deleted UInt32, files_renamed UInt32, files_modified UInt32, lines_added UInt32, lines_deleted UInt32, hunks_added UInt32, hunks_removed UInt32, hunks_changed UInt32') + ```sql + INSERT INTO git.commits SELECT * + FROM s3('https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/commits.tsv.xz', 'TSV', 'hash String,author LowCardinality(String), time DateTime, message String, files_added UInt32, files_deleted UInt32, files_renamed UInt32, files_modified UInt32, lines_added UInt32, lines_deleted UInt32, hunks_added UInt32, hunks_removed UInt32, hunks_changed UInt32') -0 rows in set. Elapsed: 1.826 sec. Processed 62.78 thousand rows, 8.50 MB (34.39 thousand rows/s., 4.66 MB/s.) -``` + 0 rows in set. Elapsed: 1.826 sec. Processed 62.78 thousand rows, 8.50 MB (34.39 thousand rows/s., 4.66 MB/s.) + ``` -*file_changes* + *file_changes* -```sql -INSERT INTO git.file_changes SELECT * -FROM s3('https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/file_changes.tsv.xz', 'TSV', 'change_type Enum(\'Add\' = 1, \'Delete\' = 2, \'Modify\' = 3, \'Rename\' = 4, \'Copy\' = 5, \'Type\' = 6), path LowCardinality(String), old_path LowCardinality(String), file_extension LowCardinality(String), lines_added UInt32, lines_deleted UInt32, hunks_added UInt32, hunks_removed UInt32, hunks_changed UInt32, commit_hash String, author LowCardinality(String), time DateTime, commit_message String, commit_files_added UInt32, commit_files_deleted UInt32, commit_files_renamed UInt32, commit_files_modified UInt32, commit_lines_added UInt32, commit_lines_deleted UInt32, commit_hunks_added UInt32, commit_hunks_removed UInt32, commit_hunks_changed UInt32') + ```sql + INSERT INTO git.file_changes SELECT * + FROM s3('https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/file_changes.tsv.xz', 'TSV', 'change_type Enum(\'Add\' = 1, \'Delete\' = 2, \'Modify\' = 3, \'Rename\' = 4, \'Copy\' = 5, \'Type\' = 6), path LowCardinality(String), old_path LowCardinality(String), file_extension LowCardinality(String), lines_added UInt32, lines_deleted UInt32, hunks_added UInt32, hunks_removed UInt32, hunks_changed UInt32, commit_hash String, author LowCardinality(String), time DateTime, commit_message String, commit_files_added UInt32, commit_files_deleted UInt32, commit_files_renamed UInt32, commit_files_modified UInt32, commit_lines_added UInt32, commit_lines_deleted UInt32, commit_hunks_added UInt32, commit_hunks_removed UInt32, commit_hunks_changed UInt32') -0 rows in set. Elapsed: 2.688 sec. Processed 266.05 thousand rows, 48.30 MB (98.97 thousand rows/s., 17.97 MB/s.) -``` + 0 rows in set. Elapsed: 2.688 sec. Processed 266.05 thousand rows, 48.30 MB (98.97 thousand rows/s., 17.97 MB/s.) 
+ ``` -*line_changes* + *line_changes* -```sql -INSERT INTO git.line_changes SELECT * -FROM s3('https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/line_changes.tsv.xz', 'TSV', ' sign Int8, line_number_old UInt32, line_number_new UInt32, hunk_num UInt32, hunk_start_line_number_old UInt32, hunk_start_line_number_new UInt32, hunk_lines_added UInt32,\n hunk_lines_deleted UInt32, hunk_context LowCardinality(String), line LowCardinality(String), indent UInt8, line_type Enum(\'Empty\' = 0, \'Comment\' = 1, \'Punct\' = 2, \'Code\' = 3), prev_commit_hash String, prev_author LowCardinality(String), prev_time DateTime, file_change_type Enum(\'Add\' = 1, \'Delete\' = 2, \'Modify\' = 3, \'Rename\' = 4, \'Copy\' = 5, \'Type\' = 6),\n path LowCardinality(String), old_path LowCardinality(String), file_extension LowCardinality(String), file_lines_added UInt32, file_lines_deleted UInt32, file_hunks_added UInt32, file_hunks_removed UInt32, file_hunks_changed UInt32, commit_hash String,\n author LowCardinality(String), time DateTime, commit_message String, commit_files_added UInt32, commit_files_deleted UInt32, commit_files_renamed UInt32, commit_files_modified UInt32, commit_lines_added UInt32, commit_lines_deleted UInt32, commit_hunks_added UInt32, commit_hunks_removed UInt32, commit_hunks_changed UInt32') + ```sql + INSERT INTO git.line_changes SELECT * + FROM s3('https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/line_changes.tsv.xz', 'TSV', ' sign Int8, line_number_old UInt32, line_number_new UInt32, hunk_num UInt32, hunk_start_line_number_old UInt32, hunk_start_line_number_new UInt32, hunk_lines_added UInt32,\n hunk_lines_deleted UInt32, hunk_context LowCardinality(String), line LowCardinality(String), indent UInt8, line_type Enum(\'Empty\' = 0, \'Comment\' = 1, \'Punct\' = 2, \'Code\' = 3), prev_commit_hash String, prev_author LowCardinality(String), prev_time DateTime, file_change_type Enum(\'Add\' = 1, \'Delete\' = 2, \'Modify\' = 3, \'Rename\' = 4, \'Copy\' = 5, \'Type\' = 6),\n path LowCardinality(String), old_path LowCardinality(String), file_extension LowCardinality(String), file_lines_added UInt32, file_lines_deleted UInt32, file_hunks_added UInt32, file_hunks_removed UInt32, file_hunks_changed UInt32, commit_hash String,\n author LowCardinality(String), time DateTime, commit_message String, commit_files_added UInt32, commit_files_deleted UInt32, commit_files_renamed UInt32, commit_files_modified UInt32, commit_lines_added UInt32, commit_lines_deleted UInt32, commit_hunks_added UInt32, commit_hunks_removed UInt32, commit_hunks_changed UInt32') -0 rows in set. Elapsed: 50.535 sec. Processed 7.54 million rows, 2.09 GB (149.11 thousand rows/s., 41.40 MB/s.) -``` + 0 rows in set. Elapsed: 50.535 sec. Processed 7.54 million rows, 2.09 GB (149.11 thousand rows/s., 41.40 MB/s.) + ``` ## Queries {#queries} @@ -256,7 +255,6 @@ LIMIT 10 10 rows in set. Elapsed: 0.006 sec. Processed 12.10 thousand rows, 1.60 MB (1.93 million rows/s., 255.40 MB/s.) ``` - We can also review the line changes, excluding renames i.e. we won't show changes before a rename event when the file existed under a different name: [play](https://sql.clickhouse.com?query_id=AKS9SYLARFMZCHGAAQNEBN) @@ -391,36 +389,36 @@ The difference here is caused by a few factors: - A rename can occur alongside other modifications to the file. These are listed as separate events in file_changes but with the same time. The `argMax` function has no way of distinguishing these - it picks the first value. 
The natural ordering of the inserts (the only means of knowing the correct order) is not maintained across the union so modified events can be selected. For example, below the `src/Functions/geometryFromColumn.h` file has several modifications before being renamed to `src/Functions/geometryConverters.h`. Our current solution may pick a Modify event as the latest change causing `src/Functions/geometryFromColumn.h` to be retained. -[play](https://sql.clickhouse.com?query_id=SCXWMR9GBMJ9UNZYQXQBFA) + [play](https://sql.clickhouse.com?query_id=SCXWMR9GBMJ9UNZYQXQBFA) -```sql - SELECT + ```sql + SELECT change_type, path, old_path, time, commit_hash - FROM git.file_changes - WHERE (path = 'src/Functions/geometryFromColumn.h') OR (old_path = 'src/Functions/geometryFromColumn.h') - - ┌─change_type─┬─path───────────────────────────────┬─old_path───────────────────────────┬────────────────time─┬─commit_hash──────────────────────────────┐ - │ Add │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ 9376b676e9a9bb8911b872e1887da85a45f7479d │ - │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ 6d59be5ea4768034f6526f7f9813062e0c369f7b │ - │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ 33acc2aa5dc091a7cb948f78c558529789b2bad8 │ - │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ 78e0db268ceadc42f82bc63a77ee1a4da6002463 │ - │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ 14a891057d292a164c4179bfddaef45a74eaf83a │ - │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ d0d6e6953c2a2af9fb2300921ff96b9362f22edb │ - │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ fe8382521139a58c0ba277eb848e88894658db66 │ - │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ 3be3d5cde8788165bc0558f1e2a22568311c3103 │ - │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ afad9bf4d0a55ed52a3f55483bc0973456e10a56 │ - │ Modify │ src/Functions/geometryFromColumn.h │ │ 2021-03-11 12:08:16 │ e3290ecc78ca3ea82b49ebcda22b5d3a4df154e6 │ - │ Rename │ src/Functions/geometryConverters.h │ src/Functions/geometryFromColumn.h │ 2021-03-11 12:08:16 │ 125945769586baf6ffd15919b29565b1b2a63218 │ - └─────────────┴────────────────────────────────────┴────────────────────────────────────┴─────────────────────┴──────────────────────────────────────────┘ - 11 rows in set. Elapsed: 0.030 sec. Processed 266.05 thousand rows, 6.61 MB (8.89 million rows/s., 220.82 MB/s.) 
-``` + ``` - Broken commit history - missing delete events. Source and cause TBD. -These differences shouldn't meaningfully impact our analysis. **We welcome improved versions of this query**. + These differences shouldn't meaningfully impact our analysis. **We welcome improved versions of this query**. ### List files with most modifications {#list-files-with-most-modifications} @@ -1287,7 +1285,6 @@ A Sankey chart (SuperSet) allows this to be visualized nicely. Note we increase Superset authors matrix - Alexey clearly likes removing other people's code. Let's exclude him for a more balanced view of code removal. Superset authors matrix v2 @@ -1542,7 +1539,6 @@ ### List files that were rewritten most number of times? 
{#list-files-that-were-rewritten-most-number-of-times} - The simplest approach to this question might be to simply count the most number of line modifications per path (restricted to current files) e.g.: ```sql diff --git a/docs/getting-started/example-datasets/laion.md b/docs/getting-started/example-datasets/laion.md index 9dab2bea84f..a7d3ad7a2bf 100644 --- a/docs/getting-started/example-datasets/laion.md +++ b/docs/getting-started/example-datasets/laion.md @@ -39,7 +39,7 @@ text_npy = "text_emb_" + str_i + '.npy' # load all files im_emb = np.load(npy_file) -text_emb = np.load(text_npy) +text_emb = np.load(text_npy) data = pd.read_parquet(metadata_file) # combine files diff --git a/docs/getting-started/example-datasets/noaa.md b/docs/getting-started/example-datasets/noaa.md index e2a24902617..06a61a88f26 100644 --- a/docs/getting-started/example-datasets/noaa.md +++ b/docs/getting-started/example-datasets/noaa.md @@ -81,12 +81,11 @@ $ clickhouse-local --query "SELECT * FROM '2021.csv.gz' LIMIT 10" --format Prett Summarizing the [format documentation](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn): - Summarizing the format documentation and the columns in order: - - An 11 character station identification code. This itself encodes some useful information - - YEAR/MONTH/DAY = 8 character date in YYYYMMDD format (e.g. 19860529 = May 29, 1986) - - ELEMENT = 4 character indicator of element type. Effectively the measurement type. While there are many measurements available, we select the following: +- An 11 character station identification code. This itself encodes some useful information +- YEAR/MONTH/DAY = 8 character date in YYYYMMDD format (e.g. 19860529 = May 29, 1986) +- ELEMENT = 4 character indicator of element type. Effectively the measurement type. While there are many measurements available, we select the following: - PRCP - Precipitation (tenths of mm) - SNOW - Snowfall (mm) - SNWD - Snow depth (mm) @@ -103,14 +102,14 @@ Summarizing the format documentation and the columns in order: - S-FLAG is the source flag for the observation. Not useful for our analysis and ignored. - OBS-TIME = 4-character time of observation in hour-minute format (i.e. 0700 =7:00 am). Typically not present in older data. We ignore this for our purposes. -A measurement per line would result in a sparse table structure in ClickHouse. We should transform to a row per time and station, with measurements as columns. First, we limit the dataset to those rows without issues i.e. where `qFlag` is equal to an empty string. + A measurement per line would result in a sparse table structure in ClickHouse. We should transform to a row per time and station, with measurements as columns. First, we limit the dataset to those rows without issues i.e. where `qFlag` is equal to an empty string. 
#### Clean the data {#clean-the-data} Using [ClickHouse local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) we can filter rows that represent measurements of interest and pass our quality requirements: ```bash -clickhouse local --query "SELECT count() +clickhouse local --query "SELECT count() FROM file('*.csv.gz', CSV, 'station_id String, date String, measurement String, value Int64, mFlag String, qFlag String, sFlag String, obsTime String') WHERE qFlag = '' AND (measurement IN ('PRCP', 'SNOW', 'SNWD', 'TMAX', 'TAVG', 'TMIN', 'PSUN', 'AWND', 'WSFG') OR startsWith(measurement, 'WT'))" 2679264563 @@ -118,7 +117,6 @@ FROM file('*.csv.gz', CSV, 'station_id String, date String, measurement String, With over 2.6 billion rows, this isn't a fast query since it involves parsing all the files. On our 8 core machine, this takes around 160 seconds. - ### Pivot data {#pivot-data} While the measurement per line structure can be used with ClickHouse, it will unnecessarily complicate future queries. Ideally, we need a row per station id and date, where each measurement type and associated value are a column i.e. @@ -184,7 +182,7 @@ SELECT station_id, name FROM file('noaa.csv', CSV, 'station_id String, date Date32, tempAvg Int32, tempMax Int32, tempMin Int32, precipitation Int32, snowfall Int32, snowDepth Int32, percentDailySun Int8, averageWindSpeed Int32, maxWindSpeed Int32, weatherType UInt8') as noaa LEFT OUTER - JOIN stations ON noaa.station_id = stations.id INTO OUTFILE 'noaa_enriched.parquet' FORMAT Parquet SETTINGS format_regexp='^(.{11})\s+(\-?\d{1,2}\.\d{4})\s+(\-?\d{1,3}\.\d{1,4})\s+(\-?\d*\.\d*)\s+(.*)\s+(?:[\d]*)'" + JOIN stations ON noaa.station_id = stations.id INTO OUTFILE 'noaa_enriched.parquet' FORMAT Parquet SETTINGS format_regexp='^(.{11})\s+(\-?\d{1,2}\.\d{4})\s+(\-?\d{1,3}\.\d{1,4})\s+(\-?\d*\.\d*)\s+(.*)\s+(?:[\d]*)'" ``` This query takes a few minutes to run and produces a 6.4 GB file, `noaa_enriched.parquet`. @@ -224,7 +222,7 @@ Data can be inserted from a local file as follows (from the ClickHouse client): INSERT INTO noaa FROM INFILE '/noaa_enriched.parquet' ``` -where `` represents the full path to the local file on disk. +where `` represents the full path to the local file on disk. See [here](https://clickhouse.com/blog/real-world-data-noaa-climate-data#load-the-data) for how to speed this load up. diff --git a/docs/getting-started/example-datasets/nyc-taxi.md b/docs/getting-started/example-datasets/nyc-taxi.md index 2471a4137bb..6029d00377c 100644 --- a/docs/getting-started/example-datasets/nyc-taxi.md +++ b/docs/getting-started/example-datasets/nyc-taxi.md @@ -18,12 +18,10 @@ The full dataset can be obtained in a couple of ways: - download prepared partitions - Alternatively users can query the full dataset in our demo environment at [sql.clickhouse.com](https://sql.clickhouse.com/?query=U0VMRUNUIGNvdW50KCkgRlJPTSBueWNfdGF4aS50cmlwcw&chart=eyJ0eXBlIjoibGluZSIsImNvbmZpZyI6eyJ0aXRsZSI6IlRlbXBlcmF0dXJlIGJ5IGNvdW50cnkgYW5kIHllYXIiLCJ4YXhpcyI6InllYXIiLCJ5YXhpcyI6ImNvdW50KCkiLCJzZXJpZXMiOiJDQVNUKHBhc3Nlbmdlcl9jb3VudCwgJ1N0cmluZycpIn19). - -:::note -The example queries below were executed on a **Production** instance of ClickHouse Cloud. For more information see -["Playground specifications"](/getting-started/playground#specifications). -::: - + :::note + The example queries below were executed on a **Production** instance of ClickHouse Cloud. 
For more information see + ["Playground specifications"](/getting-started/playground#specifications). + ::: ## Create the table trips {#create-the-table-trips} @@ -59,68 +57,64 @@ PRIMARY KEY (pickup_datetime, dropoff_datetime); ## Load the data directly from object storage {#load-the-data-directly-from-object-storage} Users' can grab a small subset of the data (3 million rows) for getting familiar with it. The data is in TSV files in object storage, which is easily streamed into -ClickHouse Cloud using the `s3` table function. +ClickHouse Cloud using the `s3` table function. The same data is stored in both S3 and GCS; choose either tab. - The following command streams three files from an S3 bucket into the `trips_small` table (the `{0..2}` syntax is a wildcard for the values 0, 1, and 2): - ```sql INSERT INTO nyc_taxi.trips_small SELECT - trip_id, - pickup_datetime, - dropoff_datetime, - pickup_longitude, - pickup_latitude, - dropoff_longitude, - dropoff_latitude, - passenger_count, - trip_distance, - fare_amount, - extra, - tip_amount, - tolls_amount, - total_amount, - payment_type, - pickup_ntaname, - dropoff_ntaname +trip_id, +pickup_datetime, +dropoff_datetime, +pickup_longitude, +pickup_latitude, +dropoff_longitude, +dropoff_latitude, +passenger_count, +trip_distance, +fare_amount, +extra, +tip_amount, +tolls_amount, +total_amount, +payment_type, +pickup_ntaname, +dropoff_ntaname FROM s3( - 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/trips_{0..2}.gz', - 'TabSeparatedWithNames' +'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/trips_{0..2}.gz', +'TabSeparatedWithNames' ); ``` - The following command streams three files from a GCS bucket into the `trips` table (the `{0..2}` syntax is a wildcard for the values 0, 1, and 2): - ```sql INSERT INTO nyc_taxi.trips_small SELECT - trip_id, - pickup_datetime, - dropoff_datetime, - pickup_longitude, - pickup_latitude, - dropoff_longitude, - dropoff_latitude, - passenger_count, - trip_distance, - fare_amount, - extra, - tip_amount, - tolls_amount, - total_amount, - payment_type, - pickup_ntaname, - dropoff_ntaname +trip_id, +pickup_datetime, +dropoff_datetime, +pickup_longitude, +pickup_latitude, +dropoff_longitude, +dropoff_latitude, +passenger_count, +trip_distance, +fare_amount, +extra, +tip_amount, +tolls_amount, +total_amount, +payment_type, +pickup_ntaname, +dropoff_ntaname FROM gcs( - 'https://storage.googleapis.com/clickhouse-public-datasets/nyc-taxi/trips_{0..2}.gz', - 'TabSeparatedWithNames' +'https://storage.googleapis.com/clickhouse-public-datasets/nyc-taxi/trips_{0..2}.gz', +'TabSeparatedWithNames' ); ``` @@ -147,7 +141,6 @@ LIMIT 10; Notice there are columns for the pickup and dropoff dates, geo coordinates, fare details, New York neighborhoods, and more. - Let's run a few queries. This query shows us the top 10 neighborhoods that have the most frequent pickups: ```sql runnable diff --git a/docs/getting-started/example-datasets/nypd_complaint_data.md b/docs/getting-started/example-datasets/nypd_complaint_data.md index 71412ea946d..b21b1e5ff87 100644 --- a/docs/getting-started/example-datasets/nypd_complaint_data.md +++ b/docs/getting-started/example-datasets/nypd_complaint_data.md @@ -14,10 +14,10 @@ While working through this guide you will: - **Preprocess and stream** the data to ClickHouse. - **Run some queries** against ClickHouse. 
-The dataset used in this guide comes from the NYC Open Data team, and contains data about "all valid felony, misdemeanor, and violation crimes reported to the New York City Police Department (NYPD)". At the time of writing, the data file is 166MB, but it is updated regularly. + The dataset used in this guide comes from the NYC Open Data team, and contains data about "all valid felony, misdemeanor, and violation crimes reported to the New York City Police Department (NYPD)". At the time of writing, the data file is 166MB, but it is updated regularly. -**Source**: [data.cityofnewyork.us](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) -**Terms of use**: https://www1.nyc.gov/home/terms-of-use.page + **Source**: [data.cityofnewyork.us](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) + **Terms of use**: https://www1.nyc.gov/home/terms-of-use.page ## Prerequisites {#prerequisites} - Download the dataset by visiting the [NYPD Complaint Data Current (Year To Date)](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) page, clicking the Export button, and choosing **TSV for Excel**. @@ -28,9 +28,9 @@ There are two types of commands in this guide: - Some of the commands are querying the TSV files, these are run at the command prompt. - The rest of the commands are querying ClickHouse, and these are run in the `clickhouse-client` or Play UI. -:::note -The examples in this guide assume that you have saved the TSV file to `${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv`, please adjust the commands if needed. -::: + :::note + The examples in this guide assume that you have saved the TSV file to `${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv`, please adjust the commands if needed. + ::: ## Familiarize yourself with the TSV file {#familiarize-yourself-with-the-tsv-file} @@ -271,9 +271,9 @@ Based on the above investigation: - Dates and times can be concatenated into DateTime types - There are some dates before January 1st 1970, which means we need a 64 bit DateTime -:::note -There are many more changes to be made to the types, they all can be determined by following the same investigation steps. Look at the number of distinct strings in a field, the min and max of the numerics, and make your decisions. The table schema that is given later in the guide has many low cardinality strings and unsigned integer fields and very few floating point numerics. -::: + :::note + There are many more changes to be made to the types, they all can be determined by following the same investigation steps. Look at the number of distinct strings in a field, the min and max of the numerics, and make your decisions. The table schema that is given later in the guide has many low cardinality strings and unsigned integer fields and very few floating point numerics. + ::: ## Concatenate the date and time fields {#concatenate-the-date-and-time-fields} @@ -374,52 +374,51 @@ of this document. - The primary key index is created using the `PRIMARY KEY` tuple if specified, otherwise the `ORDER BY` tuple - The `PRIMARY KEY` index is kept in main memory -Looking at the dataset and the questions that might be answered by querying it we might -decide that we would look at the types of crimes reported over time in the five boroughs of -New York City. 
These fields might be then included in the `ORDER BY`: - -| Column | Description (from the data dictionary) | -| ----------- | --------------------------------------------------- | -| OFNS_DESC | Description of offense corresponding with key code | -| RPT_DT | Date event was reported to police | -| BORO_NM | The name of the borough in which the incident occurred | + Looking at the dataset and the questions that might be answered by querying it we might + decide that we would look at the types of crimes reported over time in the five boroughs of + New York City. These fields might be then included in the `ORDER BY`: + | Column | Description (from the data dictionary) | + | ----------- | --------------------------------------------------- | + | OFNS_DESC | Description of offense corresponding with key code | + | RPT_DT | Date event was reported to police | + | BORO_NM | The name of the borough in which the incident occurred | -Querying the TSV file for the cardinality of the three candidate columns: + Querying the TSV file for the cardinality of the three candidate columns: -```bash -clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ ---query \ -"select formatReadableQuantity(uniq(OFNS_DESC)) as cardinality_OFNS_DESC, + ```bash + clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ + --query \ + "select formatReadableQuantity(uniq(OFNS_DESC)) as cardinality_OFNS_DESC, formatReadableQuantity(uniq(RPT_DT)) as cardinality_RPT_DT, formatReadableQuantity(uniq(BORO_NM)) as cardinality_BORO_NM - FROM - file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames') - FORMAT PrettyCompact" -``` - -Result: -```response -┌─cardinality_OFNS_DESC─┬─cardinality_RPT_DT─┬─cardinality_BORO_NM─┐ -│ 60.00 │ 306.00 │ 6.00 │ -└───────────────────────┴────────────────────┴─────────────────────┘ -``` -Ordering by cardinality, the `ORDER BY` becomes: - -```sql -ORDER BY ( BORO_NM, OFNS_DESC, RPT_DT ) -``` -:::note -The table below will use more easily read column names, the above names will be mapped to -```sql -ORDER BY ( borough, offense_description, date_reported ) -``` -::: - -Putting together the changes to data types and the `ORDER BY` tuple gives this table structure: - -```sql -CREATE TABLE NYPD_Complaint ( + FROM + file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames') + FORMAT PrettyCompact" + ``` + + Result: + ```response + ┌─cardinality_OFNS_DESC─┬─cardinality_RPT_DT─┬─cardinality_BORO_NM─┐ + │ 60.00 │ 306.00 │ 6.00 │ + └───────────────────────┴────────────────────┴─────────────────────┘ + ``` + Ordering by cardinality, the `ORDER BY` becomes: + + ```sql + ORDER BY ( BORO_NM, OFNS_DESC, RPT_DT ) + ``` + :::note + The table below will use more easily read column names, the above names will be mapped to + ```sql + ORDER BY ( borough, offense_description, date_reported ) + ``` + ::: + + Putting together the changes to data types and the `ORDER BY` tuple gives this table structure: + + ```sql + CREATE TABLE NYPD_Complaint ( complaint_number String, precinct UInt8, borough LowCardinality(String), @@ -452,9 +451,9 @@ CREATE TABLE NYPD_Complaint ( NY_y_coordinate UInt32, Latitude Float64, Longitude Float64 -) ENGINE = MergeTree - ORDER BY ( borough, offense_description, date_reported ) -``` + ) ENGINE = MergeTree + ORDER BY ( borough, offense_description, date_reported ) + ``` ### Finding the primary key of a table {#finding-the-primary-key-of-a-table} @@ -579,7 +578,6 @@ Result: └─────────────────────────────────┘ ``` - 
## Run some queries {#run-queries} ### Query 1. Compare the number of complaints by month {#query-1-compare-the-number-of-complaints-by-month} diff --git a/docs/getting-started/example-datasets/stackoverflow.md b/docs/getting-started/example-datasets/stackoverflow.md index 1baae46896e..226fe71f3ab 100644 --- a/docs/getting-started/example-datasets/stackoverflow.md +++ b/docs/getting-started/example-datasets/stackoverflow.md @@ -70,7 +70,6 @@ INSERT INTO stackoverflow.posts SELECT * FROM s3('https://datasets-documentation Posts are also available by year e.g. [https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/2020.parquet](https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/2020.parquet) - ### Votes {#votes} ```sql @@ -93,7 +92,6 @@ INSERT INTO stackoverflow.votes SELECT * FROM s3('https://datasets-documentation Votes are also available by year e.g. [https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/2020.parquet](https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/votes/2020.parquet) - ### Comments {#comments} ```sql @@ -237,7 +235,7 @@ sudo apt install jq pip install yq ``` -The following steps apply to any of the above files. We use the `stackoverflow.com-Posts.7z` file as an example. Modify as required. +The following steps apply to any of the above files. We use the `stackoverflow.com-Posts.7z` file as an example. Modify as required. Extract the file using [p7zip](https://p7zip.sourceforge.net/). This will produce a single xml file - in this case `Posts.xml`. @@ -262,7 +260,7 @@ After running the above users will have a set of files, each with 10000 lines. T find . -maxdepth 1 -type f -exec xq -c '.rows.row[]' {} \; | sed -e 's:"@:":g' > posts_v2.json ``` -The above command will produce a single `posts.json` file. +The above command will produce a single `posts.json` file. Load into ClickHouse with the following command. Note the schema is specified for the `posts.json` file. This will need to be adjusted per data type to align with the target table. diff --git a/docs/getting-started/example-datasets/star-schema.md b/docs/getting-started/example-datasets/star-schema.md index d7874aeef85..ae6e177c141 100644 --- a/docs/getting-started/example-datasets/star-schema.md +++ b/docs/getting-started/example-datasets/star-schema.md @@ -13,29 +13,29 @@ References: - [Star Schema Benchmark](https://cs.umb.edu/~poneil/StarSchemaB.pdf) (O'Neil et. al), 2009 - [Variations of the Star Schema Benchmark to Test the Effects of Data Skew on Query Performance](https://doi.org/10.1145/2479871.2479927) (Rabl. et. al.), 2013 -First, checkout the star schema benchmark repository and compile the data generator: + First, checkout the star schema benchmark repository and compile the data generator: -```bash -git clone https://github.com/vadimtk/ssb-dbgen.git -cd ssb-dbgen -make -``` + ```bash + git clone https://github.com/vadimtk/ssb-dbgen.git + cd ssb-dbgen + make + ``` -Then, generate the data. Parameter `-s` specifies the scale factor. For example, with `-s 100`, 600 million rows are generated. + Then, generate the data. Parameter `-s` specifies the scale factor. For example, with `-s 100`, 600 million rows are generated. 
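If you only want to smoke-test the pipeline first, the same per-table flags accept a smaller scale factor. A sketch, reusing the `-T` flags from the full-scale commands that follow; the comments assume the conventional mapping of the `-T` letters to the tables imported later, and roughly linear scaling (so `-s 1` yields on the order of 6 million `lineorder` rows):

```bash
# Small test dataset at scale factor 1; the full-scale run below uses -s 1000.
./dbgen -s 1 -T c   # customer
./dbgen -s 1 -T l   # lineorder
./dbgen -s 1 -T p   # part
./dbgen -s 1 -T s   # supplier
./dbgen -s 1 -T d   # date
```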
-```bash -./dbgen -s 1000 -T c -./dbgen -s 1000 -T l -./dbgen -s 1000 -T p -./dbgen -s 1000 -T s -./dbgen -s 1000 -T d -``` + ```bash + ./dbgen -s 1000 -T c + ./dbgen -s 1000 -T l + ./dbgen -s 1000 -T p + ./dbgen -s 1000 -T s + ./dbgen -s 1000 -T d + ``` -Now create tables in ClickHouse: + Now create tables in ClickHouse: -```sql -CREATE TABLE customer -( + ```sql + CREATE TABLE customer + ( C_CUSTKEY UInt32, C_NAME String, C_ADDRESS String, @@ -44,11 +44,11 @@ CREATE TABLE customer C_REGION LowCardinality(String), C_PHONE String, C_MKTSEGMENT LowCardinality(String) -) -ENGINE = MergeTree ORDER BY (C_CUSTKEY); + ) + ENGINE = MergeTree ORDER BY (C_CUSTKEY); -CREATE TABLE lineorder -( + CREATE TABLE lineorder + ( LO_ORDERKEY UInt32, LO_LINENUMBER UInt8, LO_CUSTKEY UInt32, @@ -66,11 +66,11 @@ CREATE TABLE lineorder LO_TAX UInt8, LO_COMMITDATE Date, LO_SHIPMODE LowCardinality(String) -) -ENGINE = MergeTree PARTITION BY toYear(LO_ORDERDATE) ORDER BY (LO_ORDERDATE, LO_ORDERKEY); + ) + ENGINE = MergeTree PARTITION BY toYear(LO_ORDERDATE) ORDER BY (LO_ORDERDATE, LO_ORDERKEY); -CREATE TABLE part -( + CREATE TABLE part + ( P_PARTKEY UInt32, P_NAME String, P_MFGR LowCardinality(String), @@ -80,11 +80,11 @@ CREATE TABLE part P_TYPE LowCardinality(String), P_SIZE UInt8, P_CONTAINER LowCardinality(String) -) -ENGINE = MergeTree ORDER BY P_PARTKEY; + ) + ENGINE = MergeTree ORDER BY P_PARTKEY; -CREATE TABLE supplier -( + CREATE TABLE supplier + ( S_SUPPKEY UInt32, S_NAME String, S_ADDRESS String, @@ -92,11 +92,11 @@ CREATE TABLE supplier S_NATION LowCardinality(String), S_REGION LowCardinality(String), S_PHONE String -) -ENGINE = MergeTree ORDER BY S_SUPPKEY; + ) + ENGINE = MergeTree ORDER BY S_SUPPKEY; -CREATE TABLE date -( + CREATE TABLE date + ( D_DATEKEY Date, D_DATE FixedString(18), D_DAYOFWEEK LowCardinality(String), @@ -114,29 +114,29 @@ CREATE TABLE date D_LASTDAYINMONTHFL UInt8, D_HOLIDAYFL UInt8, D_WEEKDAYFL UInt8 -) -ENGINE = MergeTree ORDER BY D_DATEKEY; -``` + ) + ENGINE = MergeTree ORDER BY D_DATEKEY; + ``` -The data can be imported as follows: + The data can be imported as follows: -```bash -clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl -clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl -clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl -clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl -clickhouse-client --query "INSERT INTO date FORMAT CSV" < date.tbl -``` + ```bash + clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl + clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl + clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl + clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl + clickhouse-client --query "INSERT INTO date FORMAT CSV" < date.tbl + ``` -In many use cases of ClickHouse, multiple tables are converted into a single denormalized flat table. -This step is optional, below queries are listed in their original form and in a format rewritten for the denormalized table. + In many use cases of ClickHouse, multiple tables are converted into a single denormalized flat table. + This step is optional, below queries are listed in their original form and in a format rewritten for the denormalized table. 
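Before building the flat table, it can be worth confirming that all five imports completed. A quick sanity check — just a sketch using the table names created above, with scalar subqueries so the result fits on a single row:

```sql
-- Row counts per table; with a large scale factor, lineorder should be by far the largest.
SELECT
    (SELECT count() FROM customer)  AS customer_rows,
    (SELECT count() FROM lineorder) AS lineorder_rows,
    (SELECT count() FROM part)      AS part_rows,
    (SELECT count() FROM supplier)  AS supplier_rows,
    (SELECT count() FROM date)      AS date_rows;
```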
-```sql -SET max_memory_usage = 20000000000; + ```sql + SET max_memory_usage = 20000000000; -CREATE TABLE lineorder_flat -ENGINE = MergeTree ORDER BY (LO_ORDERDATE, LO_ORDERKEY) -AS SELECT + CREATE TABLE lineorder_flat + ENGINE = MergeTree ORDER BY (LO_ORDERDATE, LO_ORDERKEY) + AS SELECT l.LO_ORDERKEY AS LO_ORDERKEY, l.LO_LINENUMBER AS LO_LINENUMBER, l.LO_CUSTKEY AS LO_CUSTKEY, @@ -175,349 +175,349 @@ AS SELECT p.P_TYPE AS P_TYPE, p.P_SIZE AS P_SIZE, p.P_CONTAINER AS P_CONTAINER -FROM lineorder AS l -INNER JOIN customer AS c ON c.C_CUSTKEY = l.LO_CUSTKEY -INNER JOIN supplier AS s ON s.S_SUPPKEY = l.LO_SUPPKEY -INNER JOIN part AS p ON p.P_PARTKEY = l.LO_PARTKEY; -``` + FROM lineorder AS l + INNER JOIN customer AS c ON c.C_CUSTKEY = l.LO_CUSTKEY + INNER JOIN supplier AS s ON s.S_SUPPKEY = l.LO_SUPPKEY + INNER JOIN part AS p ON p.P_PARTKEY = l.LO_PARTKEY; + ``` -The queries are generated by `./qgen -s `. Example queries for `s = 100`: + The queries are generated by `./qgen -s `. Example queries for `s = 100`: -Q1.1 + Q1.1 -```sql -SELECT + ```sql + SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS REVENUE -FROM + FROM lineorder, date -WHERE + WHERE LO_ORDERDATE = D_DATEKEY AND D_YEAR = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25; -``` + ``` -Denormalized table: + Denormalized table: -```sql -SELECT + ```sql + SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue -FROM + FROM lineorder_flat -WHERE + WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25; -``` + ``` -Q1.2 + Q1.2 -```sql -SELECT + ```sql + SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS REVENUE -FROM + FROM lineorder, date -WHERE + WHERE LO_ORDERDATE = D_DATEKEY AND D_YEARMONTHNUM = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35; -``` + ``` -Denormalized table: + Denormalized table: -```sql -SELECT + ```sql + SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue -FROM + FROM lineorder_flat -WHERE + WHERE toYYYYMM(LO_ORDERDATE) = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35; -``` + ``` -Q1.3 + Q1.3 -```sql -SELECT + ```sql + SELECT sum(LO_EXTENDEDPRICE*LO_DISCOUNT) AS REVENUE -FROM + FROM lineorder, date -WHERE + WHERE LO_ORDERDATE = D_DATEKEY AND D_WEEKNUMINYEAR = 6 AND D_YEAR = 1994 AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35; -``` + ``` -Denormalized table: + Denormalized table: -```sql -SELECT + ```sql + SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue -FROM + FROM lineorder_flat -WHERE + WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994 AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35; -``` + ``` -Q2.1 + Q2.1 -```sql -SELECT + ```sql + SELECT sum(LO_REVENUE), D_YEAR, P_BRAND -FROM + FROM lineorder, date, part, supplier -WHERE + WHERE LO_ORDERDATE = D_DATEKEY AND LO_PARTKEY = P_PARTKEY AND LO_SUPPKEY = S_SUPPKEY AND P_CATEGORY = 'MFGR#12' AND S_REGION = 'AMERICA' -GROUP BY + GROUP BY D_YEAR, P_BRAND -ORDER BY + ORDER BY D_YEAR, P_BRAND; -``` + ``` -Denormalized table: + Denormalized table: -```sql -SELECT + ```sql + SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND -FROM lineorder_flat -WHERE + FROM lineorder_flat + WHERE P_CATEGORY = 'MFGR#12' AND S_REGION = 'AMERICA' -GROUP BY + GROUP BY year, P_BRAND -ORDER BY + ORDER BY year, P_BRAND; -``` + ``` -Q2.2 + Q2.2 -```sql -SELECT + ```sql + SELECT sum(LO_REVENUE), D_YEAR, P_BRAND -FROM + FROM lineorder, date, part, supplier -WHERE + WHERE LO_ORDERDATE = D_DATEKEY AND LO_PARTKEY = P_PARTKEY AND LO_SUPPKEY = 
S_SUPPKEY AND P_BRAND BETWEEN 'MFGR#2221' AND 'MFGR#2228' AND S_REGION = 'ASIA' -GROUP BY + GROUP BY D_YEAR, P_BRAND -ORDER BY + ORDER BY D_YEAR, P_BRAND; -``` + ``` -Denormalized table: + Denormalized table: -```sql -SELECT + ```sql + SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND -FROM lineorder_flat -WHERE P_BRAND >= 'MFGR#2221' AND P_BRAND <= 'MFGR#2228' AND S_REGION = 'ASIA' -GROUP BY + FROM lineorder_flat + WHERE P_BRAND >= 'MFGR#2221' AND P_BRAND <= 'MFGR#2228' AND S_REGION = 'ASIA' + GROUP BY year, P_BRAND -ORDER BY + ORDER BY year, P_BRAND; -``` + ``` -Q2.3 + Q2.3 -```sql -SELECT + ```sql + SELECT sum(LO_REVENUE), D_YEAR, P_BRAND -FROM + FROM lineorder, date, part, supplier -WHERE + WHERE LO_ORDERDATE = D_DATEKEY AND LO_PARTKEY = P_PARTKEY AND LO_SUPPKEY = S_SUPPKEY AND P_BRAND = 'MFGR#2221' AND S_REGION = 'EUROPE' -GROUP BY + GROUP BY D_YEAR, P_BRAND -ORDER BY + ORDER BY D_YEAR, P_BRAND; -``` + ``` -Denormalized table: + Denormalized table: -```sql -SELECT + ```sql + SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND -FROM lineorder_flat -WHERE P_BRAND = 'MFGR#2239' AND S_REGION = 'EUROPE' -GROUP BY + FROM lineorder_flat + WHERE P_BRAND = 'MFGR#2239' AND S_REGION = 'EUROPE' + GROUP BY year, P_BRAND -ORDER BY + ORDER BY year, P_BRAND; -``` + ``` -Q3.1 + Q3.1 -```sql -SELECT + ```sql + SELECT C_NATION, S_NATION, D_YEAR, sum(LO_REVENUE) AS REVENUE -FROM + FROM customer, lineorder, supplier, date -WHERE + WHERE LO_CUSTKEY = C_CUSTKEY AND LO_SUPPKEY = S_SUPPKEY AND LO_ORDERDATE = D_DATEKEY AND C_REGION = 'ASIA' AND S_REGION = 'ASIA' AND D_YEAR >= 1992 AND D_YEAR <= 1997 -GROUP BY + GROUP BY C_NATION, S_NATION, D_YEAR -ORDER BY + ORDER BY D_YEAR ASC, REVENUE DESC; -``` + ``` -Denormalized table: + Denormalized table: -```sql -SELECT + ```sql + SELECT C_NATION, S_NATION, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue -FROM lineorder_flat -WHERE + FROM lineorder_flat + WHERE C_REGION = 'ASIA' AND S_REGION = 'ASIA' AND year >= 1992 AND year <= 1997 -GROUP BY + GROUP BY C_NATION, S_NATION, year -ORDER BY + ORDER BY year ASC, revenue DESC; -``` + ``` -Q3.2 + Q3.2 -```sql -SELECT + ```sql + SELECT C_CITY, S_CITY, D_YEAR, sum(LO_REVENUE) AS REVENUE -FROM + FROM customer, lineorder, supplier, date -WHERE + WHERE LO_CUSTKEY = C_CUSTKEY AND LO_SUPPKEY = S_SUPPKEY AND LO_ORDERDATE = D_DATEKEY AND C_NATION = 'UNITED STATES' AND S_NATION = 'UNITED STATES' AND D_YEAR >= 1992 AND D_YEAR <= 1997 -GROUP BY + GROUP BY C_CITY, S_CITY, D_YEAR -ORDER BY + ORDER BY D_YEAR ASC, REVENUE DESC; -``` + ``` -Denormalized table: + Denormalized table: -```sql -SELECT + ```sql + SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue -FROM lineorder_flat -WHERE + FROM lineorder_flat + WHERE C_NATION = 'UNITED STATES' AND S_NATION = 'UNITED STATES' AND year >= 1992 AND year <= 1997 -GROUP BY + GROUP BY C_CITY, S_CITY, year -ORDER BY + ORDER BY year ASC, revenue DESC; -``` + ``` -Q3.3 + Q3.3 -```sql -SELECT + ```sql + SELECT C_CITY, S_CITY, D_YEAR, sum(LO_REVENUE) AS revenue -FROM + FROM customer, lineorder, supplier, date -WHERE + WHERE LO_CUSTKEY = C_CUSTKEY AND LO_SUPPKEY = S_SUPPKEY AND LO_ORDERDATE = D_DATEKEY @@ -525,103 +525,103 @@ WHERE AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND D_YEAR >= 1992 AND D_YEAR <= 1997 -GROUP BY + GROUP BY C_CITY, S_CITY, D_YEAR -ORDER BY + ORDER BY D_YEAR ASC, revenue DESC; -``` + ``` -Denormalized table: + Denormalized table: -```sql -SELECT + ```sql + SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, 
sum(LO_REVENUE) AS revenue -FROM lineorder_flat -WHERE + FROM lineorder_flat + WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND year >= 1992 AND year <= 1997 -GROUP BY + GROUP BY C_CITY, S_CITY, year -ORDER BY + ORDER BY year ASC, revenue DESC; -``` + ``` -Q3.4 + Q3.4 -```sql -SELECT + ```sql + SELECT C_CITY, S_CITY, D_YEAR, sum(LO_REVENUE) AS revenue -FROM + FROM customer, lineorder, supplier, date -WHERE + WHERE LO_CUSTKEY = C_CUSTKEY AND LO_SUPPKEY = S_SUPPKEY AND LO_ORDERDATE = D_DATEKEY AND (C_CITY='UNITED KI1' OR C_CITY='UNITED KI5') AND (S_CITY='UNITED KI1' OR S_CITY='UNITED KI5') AND D_YEARMONTH = 'Dec1997' -GROUP BY + GROUP BY C_CITY, S_CITY, D_YEAR -ORDER BY + ORDER BY D_YEAR ASC, revenue DESC; -``` + ``` -Denormalized table: + Denormalized table: -```sql -SELECT + ```sql + SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue -FROM lineorder_flat -WHERE + FROM lineorder_flat + WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND toYYYYMM(LO_ORDERDATE) = 199712 -GROUP BY + GROUP BY C_CITY, S_CITY, year -ORDER BY + ORDER BY year ASC, revenue DESC; -``` + ``` -Q4.1 + Q4.1 -```sql -SELECT + ```sql + SELECT D_YEAR, C_NATION, sum(LO_REVENUE - LO_SUPPLYCOST) AS PROFIT -FROM + FROM date, customer, supplier, part, lineorder -WHERE + WHERE LO_CUSTKEY = C_CUSTKEY AND LO_SUPPKEY = S_SUPPKEY AND LO_PARTKEY = P_PARTKEY @@ -629,46 +629,46 @@ WHERE AND C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') -GROUP BY + GROUP BY D_YEAR, C_NATION -ORDER BY + ORDER BY D_YEAR, C_NATION -``` + ``` -Denormalized table: + Denormalized table: -```sql -SELECT + ```sql + SELECT toYear(LO_ORDERDATE) AS year, C_NATION, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit -FROM lineorder_flat -WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') -GROUP BY + FROM lineorder_flat + WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') + GROUP BY year, C_NATION -ORDER BY + ORDER BY year ASC, C_NATION ASC; -``` + ``` -Q4.2 + Q4.2 -```sql -SELECT + ```sql + SELECT D_YEAR, S_NATION, P_CATEGORY, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit -FROM + FROM date, customer, supplier, part, lineorder -WHERE + WHERE LO_CUSTKEY = C_CUSTKEY AND LO_SUPPKEY = S_SUPPKEY AND LO_PARTKEY = P_PARTKEY @@ -677,55 +677,55 @@ WHERE AND S_REGION = 'AMERICA' AND (D_YEAR = 1997 OR D_YEAR = 1998) AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') -GROUP BY + GROUP BY D_YEAR, S_NATION, P_CATEGORY -ORDER BY + ORDER BY D_YEAR, S_NATION, P_CATEGORY -``` + ``` -Denormalized table: + Denormalized table: -```sql -SELECT + ```sql + SELECT toYear(LO_ORDERDATE) AS year, S_NATION, P_CATEGORY, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit -FROM lineorder_flat -WHERE + FROM lineorder_flat + WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (year = 1997 OR year = 1998) AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') -GROUP BY + GROUP BY year, S_NATION, P_CATEGORY -ORDER BY + ORDER BY year ASC, S_NATION ASC, P_CATEGORY ASC; -``` + ``` -Q4.3 + Q4.3 -```sql -SELECT + ```sql + SELECT D_YEAR, S_CITY, P_BRAND, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit -FROM + FROM date, customer, supplier, part, lineorder -WHERE + WHERE LO_CUSTKEY = C_CUSTKEY AND LO_SUPPKEY = S_SUPPKEY AND LO_PARTKEY = P_PARTKEY @@ -734,37 +734,36 @@ WHERE AND S_NATION = 'UNITED STATES' AND (D_YEAR = 1997 OR D_YEAR = 1998) AND P_CATEGORY = 
'MFGR#14' -GROUP BY + GROUP BY D_YEAR, S_CITY, P_BRAND -ORDER BY + ORDER BY D_YEAR, S_CITY, P_BRAND -``` + ``` -Denormalized table: + Denormalized table: -```sql -SELECT + ```sql + SELECT toYear(LO_ORDERDATE) AS year, S_CITY, P_BRAND, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit -FROM + FROM lineorder_flat -WHERE + WHERE S_NATION = 'UNITED STATES' AND (year = 1997 OR year = 1998) AND P_CATEGORY = 'MFGR#14' -GROUP BY + GROUP BY year, S_CITY, P_BRAND -ORDER BY + ORDER BY year ASC, S_CITY ASC, P_BRAND ASC; -``` - + ``` diff --git a/docs/getting-started/example-datasets/tpcds.md b/docs/getting-started/example-datasets/tpcds.md index 4577d94e5cd..b146639f483 100644 --- a/docs/getting-started/example-datasets/tpcds.md +++ b/docs/getting-started/example-datasets/tpcds.md @@ -12,31 +12,31 @@ It includes 99 reporting and ad-hoc queries with random substitutions. References - [The Making of TPC-DS](https://dl.acm.org/doi/10.5555/1182635.1164217) (Nambiar), 2006 -First, checkout the TPC-DS repository and compile the data generator: + First, checkout the TPC-DS repository and compile the data generator: -```bash -git clone https://github.com/gregrahn/tpcds-kit.git -cd tpcds-kit/tools -make -``` + ```bash + git clone https://github.com/gregrahn/tpcds-kit.git + cd tpcds-kit/tools + make + ``` -Then, generate the data. Parameter `-scale` specifies the scale factor. + Then, generate the data. Parameter `-scale` specifies the scale factor. -```bash -./dsdgen -scale 1 -``` + ```bash + ./dsdgen -scale 1 + ``` -Then, generate the queries (use the same scale factor): + Then, generate the queries (use the same scale factor): -```bash -./dsqgen -DIRECTORY ../query_templates/ -INPUT ../query_templates/templates.lst -SCALE 1 # generates 99 queries in out/query_0.sql -``` + ```bash + ./dsqgen -DIRECTORY ../query_templates/ -INPUT ../query_templates/templates.lst -SCALE 1 # generates 99 queries in out/query_0.sql + ``` -Now create tables in ClickHouse. -You can either use the original table definitions in tools/tpcds.sql or "tuned" table definitions with properly defined primary key indexes and LowCardinality-type column types where it makes sense. + Now create tables in ClickHouse. + You can either use the original table definitions in tools/tpcds.sql or "tuned" table definitions with properly defined primary key indexes and LowCardinality-type column types where it makes sense. 
-```sql -CREATE TABLE call_center( + ```sql + CREATE TABLE call_center( cc_call_center_sk Int64, cc_call_center_id LowCardinality(String), cc_rec_start_date Nullable(Date), @@ -69,9 +69,9 @@ CREATE TABLE call_center( cc_gmt_offset Decimal(7,2), cc_tax_percentage Decimal(7,2), PRIMARY KEY (cc_call_center_sk) -); + ); -CREATE TABLE catalog_page( + CREATE TABLE catalog_page( cp_catalog_page_sk Int64, cp_catalog_page_id LowCardinality(String), cp_start_date_sk Nullable(UInt32), @@ -82,9 +82,9 @@ CREATE TABLE catalog_page( cp_description LowCardinality(Nullable(String)), cp_type LowCardinality(Nullable(String)), PRIMARY KEY (cp_catalog_page_sk) -); + ); -CREATE TABLE catalog_returns( + CREATE TABLE catalog_returns( cr_returned_date_sk Int32, cr_returned_time_sk Int64, cr_item_sk Int64, @@ -113,9 +113,9 @@ CREATE TABLE catalog_returns( cr_store_credit Nullable(Decimal(7,2)), cr_net_loss Nullable(Decimal(7,2)), PRIMARY KEY (cr_item_sk, cr_order_number) -); + ); -CREATE TABLE catalog_sales ( + CREATE TABLE catalog_sales ( cs_sold_date_sk Nullable(UInt32), cs_sold_time_sk Nullable(Int64), cs_ship_date_sk Nullable(UInt32), @@ -151,9 +151,9 @@ CREATE TABLE catalog_sales ( cs_net_paid_inc_ship_tax Nullable(Decimal(7,2)), cs_net_profit Decimal(7,2), PRIMARY KEY (cs_item_sk, cs_order_number) -); + ); -CREATE TABLE customer_address ( + CREATE TABLE customer_address ( ca_address_sk Int64, ca_address_id LowCardinality(String), ca_street_number LowCardinality(Nullable(String)), @@ -168,9 +168,9 @@ CREATE TABLE customer_address ( ca_gmt_offset Nullable(Decimal(7,2)), ca_location_type LowCardinality(Nullable(String)), PRIMARY KEY (ca_address_sk) -); + ); -CREATE TABLE customer_demographics ( + CREATE TABLE customer_demographics ( cd_demo_sk Int64, cd_gender LowCardinality(String), cd_marital_status LowCardinality(String), @@ -181,9 +181,9 @@ CREATE TABLE customer_demographics ( cd_dep_employed_count Int32, cd_dep_college_count Int32, PRIMARY KEY (cd_demo_sk) -); + ); -CREATE TABLE customer ( + CREATE TABLE customer ( c_customer_sk Int64, c_customer_id LowCardinality(String), c_current_cdemo_sk Nullable(Int64), @@ -203,9 +203,9 @@ CREATE TABLE customer ( c_email_address LowCardinality(Nullable(String)), c_last_review_date LowCardinality(Nullable(String)), PRIMARY KEY (c_customer_sk) -); + ); -CREATE TABLE date_dim ( + CREATE TABLE date_dim ( d_date_sk UInt32, d_date_id LowCardinality(String), d_date Date, @@ -235,33 +235,33 @@ CREATE TABLE date_dim ( d_current_quarter LowCardinality(String), d_current_year LowCardinality(String), PRIMARY KEY (d_date_sk) -); + ); -CREATE TABLE household_demographics ( + CREATE TABLE household_demographics ( hd_demo_sk Int64, hd_income_band_sk Int64, hd_buy_potential LowCardinality(String), hd_dep_count Int32, hd_vehicle_count Int32, PRIMARY KEY (hd_demo_sk) -); + ); -CREATE TABLE income_band( + CREATE TABLE income_band( ib_income_band_sk Int64, ib_lower_bound Int32, ib_upper_bound Int32, PRIMARY KEY (ib_income_band_sk), -); + ); -CREATE TABLE inventory ( + CREATE TABLE inventory ( inv_date_sk UInt32, inv_item_sk Int64, inv_warehouse_sk Int64, inv_quantity_on_hand Nullable(Int32), PRIMARY KEY (inv_date_sk, inv_item_sk, inv_warehouse_sk), -); + ); -CREATE TABLE item ( + CREATE TABLE item ( i_item_sk Int64, i_item_id LowCardinality(String), i_rec_start_date LowCardinality(Nullable(String)), @@ -285,9 +285,9 @@ CREATE TABLE item ( i_manager_id Nullable(Int32), i_product_name LowCardinality(Nullable(String)), PRIMARY KEY (i_item_sk) -); + ); -CREATE TABLE promotion ( + CREATE TABLE 
promotion ( p_promo_sk Int64, p_promo_id LowCardinality(String), p_start_date_sk Nullable(UInt32), @@ -308,16 +308,16 @@ CREATE TABLE promotion ( p_purpose LowCardinality(Nullable(String)), p_discount_active LowCardinality(Nullable(String)), PRIMARY KEY (p_promo_sk) -); + ); -CREATE TABLE reason( + CREATE TABLE reason( r_reason_sk Int64, r_reason_id LowCardinality(String), r_reason_desc LowCardinality(String), PRIMARY KEY (r_reason_sk) -); + ); -CREATE TABLE ship_mode( + CREATE TABLE ship_mode( sm_ship_mode_sk Int64, sm_ship_mode_id LowCardinality(String), sm_type LowCardinality(String), @@ -325,9 +325,9 @@ CREATE TABLE ship_mode( sm_carrier LowCardinality(String), sm_contract LowCardinality(String), PRIMARY KEY (sm_ship_mode_sk) -); + ); -CREATE TABLE store_returns ( + CREATE TABLE store_returns ( sr_returned_date_sk Nullable(UInt32), sr_return_time_sk Nullable(Int64), sr_item_sk Int64, @@ -349,9 +349,9 @@ CREATE TABLE store_returns ( sr_store_credit Nullable(Decimal(7,2)), sr_net_loss Nullable(Decimal(7,2)), PRIMARY KEY (sr_item_sk, sr_ticket_number) -); + ); -CREATE TABLE store_sales ( + CREATE TABLE store_sales ( ss_sold_date_sk Nullable(UInt32), ss_sold_time_sk Nullable(Int64), ss_item_sk Int64, @@ -376,9 +376,9 @@ CREATE TABLE store_sales ( ss_net_paid_inc_tax Nullable(Decimal(7,2)), ss_net_profit Nullable(Decimal(7,2)), PRIMARY KEY (ss_item_sk, ss_ticket_number) -); + ); -CREATE TABLE store ( + CREATE TABLE store ( s_store_sk Int64, s_store_id LowCardinality(String), s_rec_start_date LowCardinality(Nullable(String)), @@ -409,9 +409,9 @@ CREATE TABLE store ( s_gmt_offset Nullable(Decimal(7,2)), s_tax_precentage Nullable(Decimal(7,2)), PRIMARY KEY (s_store_sk) -); + ); -CREATE TABLE time_dim ( + CREATE TABLE time_dim ( t_time_sk UInt32, t_time_id LowCardinality(String), t_time UInt32, @@ -423,9 +423,9 @@ CREATE TABLE time_dim ( t_sub_shift LowCardinality(String), t_meal_time LowCardinality(Nullable(String)), PRIMARY KEY (t_time_sk) -); + ); -CREATE TABLE warehouse( + CREATE TABLE warehouse( w_warehouse_sk Int64, w_warehouse_id LowCardinality(String), w_warehouse_name LowCardinality(Nullable(String)), @@ -441,9 +441,9 @@ CREATE TABLE warehouse( w_country LowCardinality(Nullable(String)), w_gmt_offset Decimal(7,2), PRIMARY KEY (w_warehouse_sk) -); + ); -CREATE TABLE web_page( + CREATE TABLE web_page( wp_web_page_sk Int64, wp_web_page_id LowCardinality(String), wp_rec_start_date LowCardinality(Nullable(String)), @@ -459,9 +459,9 @@ CREATE TABLE web_page( wp_image_count Nullable(Int32), wp_max_ad_count Nullable(Int32), PRIMARY KEY (wp_web_page_sk) -); + ); -CREATE TABLE web_returns ( + CREATE TABLE web_returns ( wr_returned_date_sk Nullable(UInt32), wr_returned_time_sk Nullable(Int64), wr_item_sk Int64, @@ -487,9 +487,9 @@ CREATE TABLE web_returns ( wr_account_credit Nullable(Decimal(7,2)), wr_net_loss Nullable(Decimal(7,2)), PRIMARY KEY (wr_item_sk, wr_order_number) -); + ); -CREATE TABLE web_sales ( + CREATE TABLE web_sales ( ws_sold_date_sk Nullable(UInt32), ws_sold_time_sk Nullable(Int64), ws_ship_date_sk Nullable(UInt32), @@ -525,9 +525,9 @@ CREATE TABLE web_sales ( ws_net_paid_inc_ship_tax Decimal(7,2), ws_net_profit Decimal(7,2), PRIMARY KEY (ws_item_sk, ws_order_number) -); + ); -CREATE TABLE web_site ( + CREATE TABLE web_site ( web_site_sk Int64, web_site_id LowCardinality(String), web_rec_start_date LowCardinality(String), @@ -555,41 +555,41 @@ CREATE TABLE web_site ( web_gmt_offset Decimal(7,2), web_tax_percentage Decimal(7,2), PRIMARY KEY (web_site_sk) -); -``` - -The data can 
be imported as follows: - -```bash -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO call_center FORMAT CSV" < call_center.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO catalog_page FORMAT CSV" < catalog_page.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO catalog_returns FORMAT CSV" < catalog_returns.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO catalog_sales FORMAT CSV" < catalog_sales.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO customer FORMAT CSV" < customer.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO customer_address FORMAT CSV" < customer_address.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO customer_demographics FORMAT CSV" < customer_demographics.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO date_dim FORMAT CSV" < date_dim.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO household_demographics FORMAT CSV" < household_demographics.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO income_band FORMAT CSV" < income_band.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO inventory FORMAT CSV" < inventory.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO item FORMAT CSV" < item.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO promotion FORMAT CSV" < promotion.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO reason FORMAT CSV" < reason.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO ship_mode FORMAT CSV" < ship_mode.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO store FORMAT CSV" < store.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO store_returns FORMAT CSV" < store_returns.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO store_sales FORMAT CSV" < store_sales.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO time_dim FORMAT CSV" < time_dim.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO warehouse FORMAT CSV" < warehouse.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO web_page FORMAT CSV" < web_page.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO web_returns FORMAT CSV" < web_returns.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO web_sales FORMAT CSV" < web_sales.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO web_site FORMAT CSV" < web_site.tbl -``` - -Then run the generated queries. - -::::warning -TPC-DS makes heavy use of correlated subqueries which are at the time of writing (September 2024) not supported by ClickHouse ([issue #6697](https://github.com/ClickHouse/ClickHouse/issues/6697)). -As a result, many of above benchmark queries will fail with errors. 
-::::
+ );
+ ```
+
+ The data can be imported as follows:
+
+ ```bash
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO call_center FORMAT CSV" < call_center.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO catalog_page FORMAT CSV" < catalog_page.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO catalog_returns FORMAT CSV" < catalog_returns.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO catalog_sales FORMAT CSV" < catalog_sales.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO customer FORMAT CSV" < customer.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO customer_address FORMAT CSV" < customer_address.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO customer_demographics FORMAT CSV" < customer_demographics.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO date_dim FORMAT CSV" < date_dim.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO household_demographics FORMAT CSV" < household_demographics.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO income_band FORMAT CSV" < income_band.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO inventory FORMAT CSV" < inventory.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO item FORMAT CSV" < item.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO promotion FORMAT CSV" < promotion.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO reason FORMAT CSV" < reason.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO ship_mode FORMAT CSV" < ship_mode.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO store FORMAT CSV" < store.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO store_returns FORMAT CSV" < store_returns.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO store_sales FORMAT CSV" < store_sales.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO time_dim FORMAT CSV" < time_dim.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO warehouse FORMAT CSV" < warehouse.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO web_page FORMAT CSV" < web_page.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO web_returns FORMAT CSV" < web_returns.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO web_sales FORMAT CSV" < web_sales.tbl
+ clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO web_site FORMAT CSV" < web_site.tbl
+ ```
+
+ Then run the generated queries.
+
+ ::::warning
+ TPC-DS makes heavy use of correlated subqueries, which are not supported by ClickHouse at the time of writing (September 2024) ([issue #6697](https://github.com/ClickHouse/ClickHouse/issues/6697)).
+ As a result, many of the above benchmark queries will fail with errors.
+ ::::
diff --git a/docs/getting-started/example-datasets/tpch.md b/docs/getting-started/example-datasets/tpch.md
index d30249657cc..eea62ebed07 100644
--- a/docs/getting-started/example-datasets/tpch.md
+++ b/docs/getting-started/example-datasets/tpch.md
@@ -53,27 +53,26 @@ We stick as closely as possible to the rules of the TPC-H specification:
- Primary keys are created only for the columns mentioned in section 1.4.2.2 of the specification. 
- Substitution parameters were replaced by the values for query validation in sections 2.1.x.4 of the specification. - As per section 1.4.2.1, the table definitions do not use the optional `NOT NULL` constraints, even if `dbgen` generates them by default. - The performance of `SELECT` queries in ClickHouse is not affected by the presence or absence of `NOT NULL` constraints. + The performance of `SELECT` queries in ClickHouse is not affected by the presence or absence of `NOT NULL` constraints. - As per section 1.3.1, we use ClickHouse's native datatypes (e.g. `Int32`, `String`) to implement the abstract datatypes mentioned in the - specification (e.g. `Identifier`, `Variable text, size N`). The only effect of this is better readability, the SQL-92 datatypes generated - by `dbgen` (e.g. `INTEGER`, `VARCHAR(40)`) would also work in ClickHouse. + specification (e.g. `Identifier`, `Variable text, size N`). The only effect of this is better readability, the SQL-92 datatypes generated + by `dbgen` (e.g. `INTEGER`, `VARCHAR(40)`) would also work in ClickHouse. - -```sql -CREATE TABLE nation ( + ```sql + CREATE TABLE nation ( n_nationkey Int32, n_name String, n_regionkey Int32, n_comment String) -ORDER BY (n_nationkey); + ORDER BY (n_nationkey); -CREATE TABLE region ( + CREATE TABLE region ( r_regionkey Int32, r_name String, r_comment String) -ORDER BY (r_regionkey); + ORDER BY (r_regionkey); -CREATE TABLE part ( + CREATE TABLE part ( p_partkey Int32, p_name String, p_mfgr String, @@ -83,9 +82,9 @@ CREATE TABLE part ( p_container String, p_retailprice Decimal(15,2), p_comment String) -ORDER BY (p_partkey); + ORDER BY (p_partkey); -CREATE TABLE supplier ( + CREATE TABLE supplier ( s_suppkey Int32, s_name String, s_address String, @@ -93,17 +92,17 @@ CREATE TABLE supplier ( s_phone String, s_acctbal Decimal(15,2), s_comment String) -ORDER BY (s_suppkey); + ORDER BY (s_suppkey); -CREATE TABLE partsupp ( + CREATE TABLE partsupp ( ps_partkey Int32, ps_suppkey Int32, ps_availqty Int32, ps_supplycost Decimal(15,2), ps_comment String) -ORDER BY (ps_partkey, ps_suppkey); + ORDER BY (ps_partkey, ps_suppkey); -CREATE TABLE customer ( + CREATE TABLE customer ( c_custkey Int32, c_name String, c_address String, @@ -112,9 +111,9 @@ CREATE TABLE customer ( c_acctbal Decimal(15,2), c_mktsegment String, c_comment String) -ORDER BY (c_custkey); + ORDER BY (c_custkey); -CREATE TABLE orders ( + CREATE TABLE orders ( o_orderkey Int32, o_custkey Int32, o_orderstatus String, @@ -124,12 +123,12 @@ CREATE TABLE orders ( o_clerk String, o_shippriority Int32, o_comment String) -ORDER BY (o_orderkey); --- The following is an alternative order key which is not compliant with the official TPC-H rules but recommended by sec. 4.5 in --- "Quantifying TPC-H Choke Points and Their Optimizations": --- ORDER BY (o_orderdate, o_orderkey); + ORDER BY (o_orderkey); + -- The following is an alternative order key which is not compliant with the official TPC-H rules but recommended by sec. 4.5 in + -- "Quantifying TPC-H Choke Points and Their Optimizations": + -- ORDER BY (o_orderdate, o_orderkey); -CREATE TABLE lineitem ( + CREATE TABLE lineitem ( l_orderkey Int32, l_partkey Int32, l_suppkey Int32, @@ -146,51 +145,51 @@ CREATE TABLE lineitem ( l_shipinstruct String, l_shipmode String, l_comment String) -ORDER BY (l_orderkey, l_linenumber); --- The following is an alternative order key which is not compliant with the official TPC-H rules but recommended by sec. 
4.5 in --- "Quantifying TPC-H Choke Points and Their Optimizations": --- ORDER BY (l_shipdate, l_orderkey, l_linenumber); -``` - -The data can be imported as follows: - -```bash -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO nation FORMAT CSV" < nation.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO region FORMAT CSV" < region.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO part FORMAT CSV" < part.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO partsupp FORMAT CSV" < partsupp.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO customer FORMAT CSV" < customer.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO orders FORMAT CSV" < orders.tbl -clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO lineitem FORMAT CSV" < lineitem.tbl -``` - -:::note -Instead of using tpch-kit and generating the tables by yourself, you can alternatively import the data from a public S3 bucket. Make sure -to create empty tables first using above `CREATE` statements. - -```sql --- Scaling factor 1 -INSERT INTO nation SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/nation.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO region SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/region.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO part SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/part.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO supplier SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/supplier.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO partsupp SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/partsupp.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO customer SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/customer.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO orders SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/orders.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO lineitem SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/lineitem.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; - --- Scaling factor 100 -INSERT INTO nation SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/nation.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO region SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/region.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', 
input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO part SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/part.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO supplier SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/supplier.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO partsupp SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/partsupp.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO customer SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/customer.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO orders SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/orders.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -INSERT INTO lineitem SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/lineitem.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; -```` -::: + ORDER BY (l_orderkey, l_linenumber); + -- The following is an alternative order key which is not compliant with the official TPC-H rules but recommended by sec. 4.5 in + -- "Quantifying TPC-H Choke Points and Their Optimizations": + -- ORDER BY (l_shipdate, l_orderkey, l_linenumber); + ``` + + The data can be imported as follows: + + ```bash + clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO nation FORMAT CSV" < nation.tbl + clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO region FORMAT CSV" < region.tbl + clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO part FORMAT CSV" < part.tbl + clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl + clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO partsupp FORMAT CSV" < partsupp.tbl + clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO customer FORMAT CSV" < customer.tbl + clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO orders FORMAT CSV" < orders.tbl + clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO lineitem FORMAT CSV" < lineitem.tbl + ``` + + :::note + Instead of using tpch-kit and generating the tables by yourself, you can alternatively import the data from a public S3 bucket. Make sure + to create empty tables first using above `CREATE` statements. 
+ + ```sql + -- Scaling factor 1 + INSERT INTO nation SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/nation.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO region SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/region.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO part SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/part.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO supplier SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/supplier.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO partsupp SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/partsupp.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO customer SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/customer.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO orders SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/orders.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO lineitem SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/1/lineitem.tbl', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + + -- Scaling factor 100 + INSERT INTO nation SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/nation.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO region SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/region.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO part SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/part.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO supplier SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/supplier.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO partsupp SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/partsupp.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO customer SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/customer.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO orders SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/orders.tbl.gz', NOSIGN, CSV) SETTINGS 
format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + INSERT INTO lineitem SELECT * FROM s3('https://clickhouse-datasets.s3.amazonaws.com/h/100/lineitem.tbl.gz', NOSIGN, CSV) SETTINGS format_csv_delimiter = '|', input_format_defaults_for_omitted_fields = 1, input_format_csv_empty_as_default = 1; + ```` + ::: ## Queries {#queries} @@ -380,17 +379,17 @@ WHERE AND EXISTS ( SELECT * - FROM - lineitem - WHERE - l_orderkey = o_orderkey - AND l_commitdate < l_receiptdate - ) -GROUP BY - o_orderpriority -ORDER BY - o_orderpriority; -``` + FROM + lineitem + WHERE + l_orderkey = o_orderkey + AND l_commitdate < l_receiptdate + ) + GROUP BY + o_orderpriority + ORDER BY + o_orderpriority; + ``` ::::note Until v25.5, the query did not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 @@ -1026,33 +1025,33 @@ WHERE AND EXISTS ( SELECT * - FROM - lineitem l2 - WHERE - l2.l_orderkey = l1.l_orderkey - AND l2.l_suppkey <> l1.l_suppkey - ) - AND NOT EXISTS ( - SELECT + FROM + lineitem l2 + WHERE + l2.l_orderkey = l1.l_orderkey + AND l2.l_suppkey <> l1.l_suppkey + ) + AND NOT EXISTS ( + SELECT * - FROM - lineitem l3 - WHERE - l3.l_orderkey = l1.l_orderkey - AND l3.l_suppkey <> l1.l_suppkey - AND l3.l_receiptdate > l3.l_commitdate - ) - AND s_nationkey = n_nationkey - AND n_name = 'SAUDI ARABIA' -GROUP BY - s_name -ORDER BY - numwait DESC, - s_name; -``` -::::note -Until v25.5, the query did not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 -:::: + FROM + lineitem l3 + WHERE + l3.l_orderkey = l1.l_orderkey + AND l3.l_suppkey <> l1.l_suppkey + AND l3.l_receiptdate > l3.l_commitdate + ) + AND s_nationkey = n_nationkey + AND n_name = 'SAUDI ARABIA' + GROUP BY + s_name + ORDER BY + numwait DESC, + s_name; + ``` + ::::note + Until v25.5, the query did not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 + :::: **Q22** @@ -1085,17 +1084,17 @@ FROM ( AND NOT EXISTS ( SELECT * - FROM - orders - WHERE - o_custkey = c_custkey - ) - ) AS custsale -GROUP BY - cntrycode -ORDER BY - cntrycode; -``` + FROM + orders + WHERE + o_custkey = c_custkey + ) + ) AS custsale + GROUP BY + cntrycode + ORDER BY + cntrycode; + ``` ::::note Until v25.5, the query did not work out-of-the box due to correlated subqueries. Corresponding issue: https://github.com/ClickHouse/ClickHouse/issues/6697 diff --git a/docs/getting-started/example-datasets/tw-weather.md b/docs/getting-started/example-datasets/tw-weather.md index b5d969681bc..9a5c069737b 100644 --- a/docs/getting-started/example-datasets/tw-weather.md +++ b/docs/getting-started/example-datasets/tw-weather.md @@ -172,32 +172,32 @@ To know how to speed this up, please see our blog post on [tuning large data loa 1. Let's see how many rows are inserted: -```sql -SELECT formatReadableQuantity(count()) -FROM tw_weather_data; -``` + ```sql + SELECT formatReadableQuantity(count()) + FROM tw_weather_data; + ``` -```response -┌─formatReadableQuantity(count())─┐ -│ 131.99 million │ -└─────────────────────────────────┘ -``` + ```response + ┌─formatReadableQuantity(count())─┐ + │ 131.99 million │ + └─────────────────────────────────┘ + ``` 2. 
Let's see how much disk space are used for this table: -```sql -SELECT + ```sql + SELECT formatReadableSize(sum(bytes)) AS disk_size, formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed_size -FROM system.parts -WHERE (`table` = 'tw_weather_data') AND active -``` - -```response -┌─disk_size─┬─uncompressed_size─┐ -│ 2.13 GiB │ 32.94 GiB │ -└───────────┴───────────────────┘ -``` + FROM system.parts + WHERE (`table` = 'tw_weather_data') AND active + ``` + + ```response + ┌─disk_size─┬─uncompressed_size─┐ + │ 2.13 GiB │ 32.94 GiB │ + └───────────┴───────────────────┘ + ``` ## Sample queries {#sample-queries} diff --git a/docs/getting-started/example-datasets/uk-price-paid.md b/docs/getting-started/example-datasets/uk-price-paid.md index a80137b76b9..3ce924e1e2a 100644 --- a/docs/getting-started/example-datasets/uk-price-paid.md +++ b/docs/getting-started/example-datasets/uk-price-paid.md @@ -50,11 +50,11 @@ We will use the `url` function to stream the data into ClickHouse. We need to pr - transforming the `is_new` field from a single-character string (`Y`/`N`) to a [UInt8](/sql-reference/data-types/int-uint) field with 0 or 1 - drop the last two columns since they all have the same value (which is 0) -The `url` function streams the data from the web server into your ClickHouse table. The following command inserts 5 million rows into the `uk_price_paid` table: + The `url` function streams the data from the web server into your ClickHouse table. The following command inserts 5 million rows into the `uk_price_paid` table: -```sql -INSERT INTO uk.uk_price_paid -SELECT + ```sql + INSERT INTO uk.uk_price_paid + SELECT toUInt32(price_string) AS price, parseDateTimeBestEffortUS(time) AS date, splitByChar(' ', postcode)[1] AS postcode1, @@ -69,7 +69,7 @@ SELECT town, district, county -FROM url( + FROM url( 'http://prod1.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv', 'CSV', 'uuid_string String, @@ -88,10 +88,10 @@ FROM url( county String, d String, e String' -) SETTINGS max_http_get_redirects=10; -``` + ) SETTINGS max_http_get_redirects=10; + ``` -Wait for the data to insert - it will take a minute or two depending on the network speed. + Wait for the data to insert - it will take a minute or two depending on the network speed. ## Validate the data {#validate-data} @@ -171,4 +171,3 @@ We can speed up these queries with projections. See ["Projections"](/data-modeli ### Test it in the playground {#playground} The dataset is also available in the [Online Playground](https://sql.clickhouse.com?query_id=TRCWH5ZETY4SEEK8ISCCAX). - diff --git a/docs/getting-started/example-datasets/wikistat.md b/docs/getting-started/example-datasets/wikistat.md index f13fe397cb7..c76c7097d0e 100644 --- a/docs/getting-started/example-datasets/wikistat.md +++ b/docs/getting-started/example-datasets/wikistat.md @@ -51,10 +51,10 @@ Loading the data: ```shell clickhouse-local --query " - WITH replaceRegexpOne(_path, '^.+pageviews-(\\d{4})(\\d{2})(\\d{2})-(\\d{2})(\\d{2})(\\d{2}).gz$', '\1-\2-\3 \4-\5-\6')::DateTime AS time, + WITH replaceRegexpOne(_path, '^.+pageviews-(\\d{4})(\\d{2})(\\d{2})-(\\d{2})(\\d{2})(\\d{2}).gz$', '\1-\2-\3 \4-\5-\6')::DateTime AS time, extractGroups(line, '^([^ \\.]+)(\\.[^ ]+)? 
+([^ ]+) +(\\d+) +(\\d+)$') AS values - SELECT - time, + SELECT + time, values[1] AS project, values[2] AS subproject, values[3] AS path, diff --git a/docs/getting-started/example-datasets/youtube-dislikes.md b/docs/getting-started/example-datasets/youtube-dislikes.md index e3676c20d6a..671d1ef9c8b 100644 --- a/docs/getting-started/example-datasets/youtube-dislikes.md +++ b/docs/getting-started/example-datasets/youtube-dislikes.md @@ -23,46 +23,46 @@ The steps below will easily work on a local install of ClickHouse too. The only 1. Let's see what the data looks like. The `s3cluster` table function returns a table, so we can `DESCRIBE` the result: -```sql -DESCRIBE s3( + ```sql + DESCRIBE s3( 'https://clickhouse-public-datasets.s3.amazonaws.com/youtube/original/files/*.zst', 'JSONLines' -); -``` - -ClickHouse infers the following schema from the JSON file: - -```response -┌─name────────────────┬─type───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐ -│ id │ Nullable(String) │ │ │ │ │ │ -│ fetch_date │ Nullable(String) │ │ │ │ │ │ -│ upload_date │ Nullable(String) │ │ │ │ │ │ -│ title │ Nullable(String) │ │ │ │ │ │ -│ uploader_id │ Nullable(String) │ │ │ │ │ │ -│ uploader │ Nullable(String) │ │ │ │ │ │ -│ uploader_sub_count │ Nullable(Int64) │ │ │ │ │ │ -│ is_age_limit │ Nullable(Bool) │ │ │ │ │ │ -│ view_count │ Nullable(Int64) │ │ │ │ │ │ -│ like_count │ Nullable(Int64) │ │ │ │ │ │ -│ dislike_count │ Nullable(Int64) │ │ │ │ │ │ -│ is_crawlable │ Nullable(Bool) │ │ │ │ │ │ -│ is_live_content │ Nullable(Bool) │ │ │ │ │ │ -│ has_subtitles │ Nullable(Bool) │ │ │ │ │ │ -│ is_ads_enabled │ Nullable(Bool) │ │ │ │ │ │ -│ is_comments_enabled │ Nullable(Bool) │ │ │ │ │ │ -│ description │ Nullable(String) │ │ │ │ │ │ -│ rich_metadata │ Array(Tuple(call Nullable(String), content Nullable(String), subtitle Nullable(String), title Nullable(String), url Nullable(String))) │ │ │ │ │ │ -│ super_titles │ Array(Tuple(text Nullable(String), url Nullable(String))) │ │ │ │ │ │ -│ uploader_badges │ Nullable(String) │ │ │ │ │ │ -│ video_badges │ Nullable(String) │ │ │ │ │ │ -└─────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ -``` + ); + ``` + + ClickHouse infers the following schema from the JSON file: + + ```response + ┌─name────────────────┬─type───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐ + │ id │ Nullable(String) │ │ │ │ │ │ + │ fetch_date │ Nullable(String) │ │ │ │ │ │ + │ upload_date │ Nullable(String) │ │ │ │ │ │ + │ title │ Nullable(String) │ │ │ │ │ │ + │ uploader_id │ Nullable(String) │ │ │ │ │ │ + │ uploader │ Nullable(String) │ │ │ │ │ │ + │ uploader_sub_count │ Nullable(Int64) │ │ │ │ │ │ + │ is_age_limit │ Nullable(Bool) │ │ │ │ │ │ + │ view_count │ Nullable(Int64) │ │ │ │ │ │ + │ like_count │ Nullable(Int64) │ │ │ │ │ │ + │ dislike_count │ Nullable(Int64) │ │ │ │ │ │ + │ is_crawlable │ Nullable(Bool) │ │ │ │ │ │ + │ is_live_content │ Nullable(Bool) │ │ │ │ │ │ + │ has_subtitles │ Nullable(Bool) │ │ │ │ │ │ + │ is_ads_enabled │ Nullable(Bool) │ │ │ │ │ │ + │ is_comments_enabled │ 
Nullable(Bool) │ │ │ │ │ │ + │ description │ Nullable(String) │ │ │ │ │ │ + │ rich_metadata │ Array(Tuple(call Nullable(String), content Nullable(String), subtitle Nullable(String), title Nullable(String), url Nullable(String))) │ │ │ │ │ │ + │ super_titles │ Array(Tuple(text Nullable(String), url Nullable(String))) │ │ │ │ │ │ + │ uploader_badges │ Nullable(String) │ │ │ │ │ │ + │ video_badges │ Nullable(String) │ │ │ │ │ │ + └─────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ + ``` 2. Based on the inferred schema, we cleaned up the data types and added a primary key. Define the following table: -```sql -CREATE TABLE youtube -( + ```sql + CREATE TABLE youtube + ( `id` String, `fetch_date` DateTime, `upload_date_str` String, @@ -84,21 +84,21 @@ CREATE TABLE youtube `super_titles` Array(Tuple(text String, url String)), `uploader_badges` String, `video_badges` String -) -ENGINE = MergeTree -ORDER BY (uploader, upload_date) -``` + ) + ENGINE = MergeTree + ORDER BY (uploader, upload_date) + ``` 3. The following command streams the records from the S3 files into the `youtube` table. -:::important -This inserts a lot of data - 4.65 billion rows. If you do not want the entire dataset, simply add a `LIMIT` clause with the desired number of rows. -::: + :::important + This inserts a lot of data - 4.65 billion rows. If you do not want the entire dataset, simply add a `LIMIT` clause with the desired number of rows. + ::: -```sql -INSERT INTO youtube -SETTINGS input_format_null_as_default = 1 -SELECT + ```sql + INSERT INTO youtube + SETTINGS input_format_null_as_default = 1 + SELECT id, parseDateTimeBestEffortUSOrZero(toString(fetch_date)) AS fetch_date, upload_date AS upload_date_str, @@ -120,13 +120,13 @@ SELECT super_titles, ifNull(uploader_badges, '') AS uploader_badges, ifNull(video_badges, '') AS video_badges -FROM s3( + FROM s3( 'https://clickhouse-public-datasets.s3.amazonaws.com/youtube/original/files/*.zst', 'JSONLines' -) -``` + ) + ``` -Some comments about our `INSERT` command: + Some comments about our `INSERT` command: - The `parseDateTimeBestEffortUSOrZero` function is handy when the incoming date fields may not be in the proper format. If `fetch_date` does not get parsed properly, it will be set to `0` - The `upload_date` column contains valid dates, but it also contains strings like "4 hours ago" - which is certainly not a valid date. We decided to store the original value in `upload_date_str` and attempt to parse it with `toDate(parseDateTimeBestEffortUSOrZero(upload_date::String))`. If the parsing fails we just get `0` @@ -134,94 +134,94 @@ Some comments about our `INSERT` command: 4. Open a new tab in the SQL Console of ClickHouse Cloud (or a new `clickhouse-client` window) and watch the count increase. It will take a while to insert 4.56B rows, depending on your server resources. (Without any tweaking of settings, it takes about 4.5 hours.) -```sql -SELECT formatReadableQuantity(count()) -FROM youtube -``` + ```sql + SELECT formatReadableQuantity(count()) + FROM youtube + ``` -```response -┌─formatReadableQuantity(count())─┐ -│ 4.56 billion │ -└─────────────────────────────────┘ -``` + ```response + ┌─formatReadableQuantity(count())─┐ + │ 4.56 billion │ + └─────────────────────────────────┘ + ``` 5. 
Once the data is inserted, go ahead and count the number of dislikes of your favorite videos or channels. Let's see how many videos were uploaded by ClickHouse: -```sql -SELECT count() -FROM youtube -WHERE uploader = 'ClickHouse'; -``` + ```sql + SELECT count() + FROM youtube + WHERE uploader = 'ClickHouse'; + ``` -```response -┌─count()─┐ -│ 84 │ -└─────────┘ + ```response + ┌─count()─┐ + │ 84 │ + └─────────┘ -1 row in set. Elapsed: 0.570 sec. Processed 237.57 thousand rows, 5.77 MB (416.54 thousand rows/s., 10.12 MB/s.) -``` + 1 row in set. Elapsed: 0.570 sec. Processed 237.57 thousand rows, 5.77 MB (416.54 thousand rows/s., 10.12 MB/s.) + ``` -:::note -The query above runs so quickly because we chose `uploader` as the first column of the primary key - so it only had to process 237k rows. -::: + :::note + The query above runs so quickly because we chose `uploader` as the first column of the primary key - so it only had to process 237k rows. + ::: 6. Let's look and likes and dislikes of ClickHouse videos: -```sql -SELECT + ```sql + SELECT title, like_count, dislike_count -FROM youtube -WHERE uploader = 'ClickHouse' -ORDER BY dislike_count DESC; -``` + FROM youtube + WHERE uploader = 'ClickHouse' + ORDER BY dislike_count DESC; + ``` -The response looks like: + The response looks like: -```response -┌─title────────────────────────────────────────────────────────────────────────────────────────────────┬─like_count─┬─dislike_count─┐ -│ ClickHouse v21.11 Release Webinar │ 52 │ 3 │ -│ ClickHouse Introduction │ 97 │ 3 │ -│ Casa Modelo Algarve │ 180 │ 3 │ -│ Профайлер запросов: трудный путь │ 33 │ 3 │ -│ ClickHouse в Курсометре │ 4 │ 2 │ -│ 10 Good Reasons to Use ClickHouse │ 27 │ 2 │ -... - -84 rows in set. Elapsed: 0.013 sec. Processed 155.65 thousand rows, 16.94 MB (11.96 million rows/s., 1.30 GB/s.) -``` + ```response + ┌─title────────────────────────────────────────────────────────────────────────────────────────────────┬─like_count─┬─dislike_count─┐ + │ ClickHouse v21.11 Release Webinar │ 52 │ 3 │ + │ ClickHouse Introduction │ 97 │ 3 │ + │ Casa Modelo Algarve │ 180 │ 3 │ + │ Профайлер запросов: трудный путь │ 33 │ 3 │ + │ ClickHouse в Курсометре │ 4 │ 2 │ + │ 10 Good Reasons to Use ClickHouse │ 27 │ 2 │ + ... + + 84 rows in set. Elapsed: 0.013 sec. Processed 155.65 thousand rows, 16.94 MB (11.96 million rows/s., 1.30 GB/s.) + ``` 7. Here is a search for videos with **ClickHouse** in the `title` or `description` fields: -```sql -SELECT + ```sql + SELECT view_count, like_count, dislike_count, concat('https://youtu.be/', id) AS url, title -FROM youtube -WHERE (title ILIKE '%ClickHouse%') OR (description ILIKE '%ClickHouse%') -ORDER BY + FROM youtube + WHERE (title ILIKE '%ClickHouse%') OR (description ILIKE '%ClickHouse%') + ORDER BY like_count DESC, view_count DESC; -``` + ``` -This query has to process every row, and also parse through two columns of strings. Even then, we get decent performance at 4.15M rows/second: + This query has to process every row, and also parse through two columns of strings. Even then, we get decent performance at 4.15M rows/second: -```response -1174 rows in set. Elapsed: 1099.368 sec. Processed 4.56 billion rows, 1.98 TB (4.15 million rows/s., 1.80 GB/s.) -``` + ```response + 1174 rows in set. Elapsed: 1099.368 sec. Processed 4.56 billion rows, 1.98 TB (4.15 million rows/s., 1.80 GB/s.) 
+ ``` -The results look like: + The results look like: -```response -┌─view_count─┬─like_count─┬─dislike_count─┬─url──────────────────────────┬─title──────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ 1919 │ 63 │ 1 │ https://youtu.be/b9MeoOtAivQ │ ClickHouse v21.10 Release Webinar │ -│ 8710 │ 62 │ 4 │ https://youtu.be/PeV1mC2z--M │ What is JDBC DriverManager? | JDBC │ -│ 3534 │ 62 │ 1 │ https://youtu.be/8nWRhK9gw10 │ CLICKHOUSE - Arquitetura Modular │ -``` + ```response + ┌─view_count─┬─like_count─┬─dislike_count─┬─url──────────────────────────┬─title──────────────────────────────────────────────────────────────────────────────────────────────────┐ + │ 1919 │ 63 │ 1 │ https://youtu.be/b9MeoOtAivQ │ ClickHouse v21.10 Release Webinar │ + │ 8710 │ 62 │ 4 │ https://youtu.be/PeV1mC2z--M │ What is JDBC DriverManager? | JDBC │ + │ 3534 │ 62 │ 1 │ https://youtu.be/8nWRhK9gw10 │ CLICKHOUSE - Arquitetura Modular │ + ``` ## Questions {#questions} @@ -280,7 +280,6 @@ ORDER BY Enabling comments seems to be correlated with a higher rate of engagement. - ### How does the number of videos change over time - notable events? {#how-does-the-number-of-videos-change-over-time---notable-events} ```sql @@ -319,7 +318,6 @@ ORDER BY month ASC; A spike of uploaders [around covid is noticeable](https://www.theverge.com/2020/3/27/21197642/youtube-with-me-style-videos-views-coronavirus-cook-workout-study-home-beauty). - ### More subtitles over time and when {#more-subtitles-over-time-and-when} With advances in speech recognition, it's easier than ever to create subtitles for video with youtube adding auto-captioning in late 2009 - was the jump then? @@ -357,7 +355,6 @@ ORDER BY month ASC; The data results show a spike in 2009. Apparently at that, time YouTube was removing their community captions feature, which allowed you to upload captions for other people's video. This prompted a very successful campaign to have creators add captions to their videos for hard of hearing and deaf viewers. - ### Top uploaders over time {#top-uploaders-over-time} ```sql diff --git a/docs/getting-started/index.md b/docs/getting-started/index.md index 54e73320931..29bd7f1335a 100644 --- a/docs/getting-started/index.md +++ b/docs/getting-started/index.md @@ -15,13 +15,13 @@ We have a lot of resources for helping you get started and learn how ClickHouse - If you need to get ClickHouse up and running, check out our [Quick Start](/get-started/quick-start) - The [ClickHouse Tutorial](../tutorial.md) analyzes a dataset of New York City taxi rides -In addition, the sample datasets provide a great experience on working with ClickHouse, -learning important techniques and tricks, and seeing how to take advantage of the many powerful -functions in ClickHouse. The sample datasets include: + In addition, the sample datasets provide a great experience on working with ClickHouse, + learning important techniques and tricks, and seeing how to take advantage of the many powerful + functions in ClickHouse. The sample datasets include: - + - + - + diff --git a/docs/getting-started/install/_snippets/_deb_install.md b/docs/getting-started/install/_snippets/_deb_install.md index 31e1fa6532d..9b3346bada8 100644 --- a/docs/getting-started/install/_snippets/_deb_install.md +++ b/docs/getting-started/install/_snippets/_deb_install.md @@ -6,108 +6,75 @@ import TabItem from '@theme/TabItem'; > It is recommended to use official pre-compiled `deb` packages for **Debian** or **Ubuntu**. 
- ## Setup the Debian repository {#setup-the-debian-repository} - To install ClickHouse run the following commands: - ```bash # Install prerequisite packages sudo apt-get install -y apt-transport-https ca-certificates curl gnupg - # Download the ClickHouse GPG key and store it in the keyring curl -fsSL 'https://packages.clickhouse.com/rpm/lts/repodata/repomd.xml.key' | sudo gpg --dearmor -o /usr/share/keyrings/clickhouse-keyring.gpg - # Get the system architecture ARCH=$(dpkg --print-architecture) - # Add the ClickHouse repository to apt sources echo "deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg arch=${ARCH}] https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list - # Update apt package lists sudo apt-get update ``` - - You can replace `stable` with `lts` to use different [release kinds](/knowledgebase/production) based on your needs. - You can download and install packages manually from [packages.clickhouse.com](https://packages.clickhouse.com/deb/pool/main/c/). -
Old distributions method for installing the deb-packages - ```bash # Install prerequisite packages sudo apt-get install apt-transport-https ca-certificates dirmngr - # Add the ClickHouse GPG key to authenticate packages sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754 - # Add the ClickHouse repository to apt sources echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee \ - /etc/apt/sources.list.d/clickhouse.list - +/etc/apt/sources.list.d/clickhouse.list # Update apt package lists sudo apt-get update - # Install ClickHouse server and client packages sudo apt-get install -y clickhouse-server clickhouse-client - # Start the ClickHouse server service sudo service clickhouse-server start - # Launch the ClickHouse command line client clickhouse-client # or "clickhouse-client --password" if you set up a password. ``` -
- ## Install ClickHouse server and client {#install-clickhouse-server-and-client} - ```bash sudo apt-get install -y clickhouse-server clickhouse-client ``` - ## Start ClickHouse {#start-clickhouse-server} - To start the ClickHouse server, run: - ```bash sudo service clickhouse-server start ``` - To start ClickHouse client, run: - ```bash clickhouse-client ``` - If you set up a password for your server, then you will need to run: - ```bash clickhouse-client --password ``` - ## Install standalone ClickHouse Keeper {#install-standalone-clickhouse-keeper} - :::tip In production environments we strongly recommend running ClickHouse Keeper on dedicated nodes. -In test environments, if you decide to run ClickHouse Server and ClickHouse Keeper on the same server, +In test environments, if you decide to run ClickHouse Server and ClickHouse Keeper on the same server, then you do not need to install ClickHouse Keeper as it is included with ClickHouse server. ::: - To install `clickhouse-keeper` on standalone ClickHouse Keeper servers, run: - ```bash sudo apt-get install -y clickhouse-keeper ``` - ## Enable and start ClickHouse Keeper {#enable-and-start-clickhouse-keeper} - ```bash sudo systemctl enable clickhouse-keeper sudo systemctl start clickhouse-keeper sudo systemctl status clickhouse-keeper ``` -
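Once the server (and, optionally, Keeper) is running, a quick way to confirm that the installation responds is to run a trivial query from `clickhouse-client`. This is only a minimal sanity check; the version string you see will depend on the packages you installed:

```sql
-- Returns the version of the server you are connected to
SELECT version();
```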
## Packages {#packages} diff --git a/docs/getting-started/install/_snippets/_docker.md b/docs/getting-started/install/_snippets/_docker.md index e32838b49d6..57068811e77 100644 --- a/docs/getting-started/install/_snippets/_docker.md +++ b/docs/getting-started/install/_snippets/_docker.md @@ -1,7 +1,7 @@ # Install ClickHouse using Docker The guide on [Docker Hub](https://hub.docker.com/r/clickhouse/clickhouse-server/) -is reproduced below for convenience. The Docker images available make use of +is reproduced below for convenience. The Docker images available make use of the official ClickHouse deb packages. Docker pull command: @@ -21,14 +21,14 @@ docker pull clickhouse/clickhouse-server ### Compatibility {#compatibility} - The amd64 image requires support for [SSE3 instructions](https://en.wikipedia.org/wiki/SSE3). - Virtually all x86 CPUs after 2005 support SSE3. + Virtually all x86 CPUs after 2005 support SSE3. - The arm64 image requires support for the [ARMv8.2-A architecture](https://en.wikipedia.org/wiki/AArch64#ARMv8.2-A) and - additionally the Load-Acquire RCpc register. The register is optional in version ARMv8.2-A and mandatory in - [ARMv8.3-A](https://en.wikipedia.org/wiki/AArch64#ARMv8.3-A). Supported in Graviton >=2, Azure and GCP instances. - Examples for unsupported devices are Raspberry Pi 4 (ARMv8.0-A) and Jetson AGX Xavier/Orin (ARMv8.2-A). + additionally the Load-Acquire RCpc register. The register is optional in version ARMv8.2-A and mandatory in + [ARMv8.3-A](https://en.wikipedia.org/wiki/AArch64#ARMv8.3-A). Supported in Graviton >=2, Azure and GCP instances. + Examples for unsupported devices are Raspberry Pi 4 (ARMv8.0-A) and Jetson AGX Xavier/Orin (ARMv8.2-A). - Since ClickHouse 24.11 Ubuntu images began using `ubuntu:22.04` as its base image. It requires docker version >= `20.10.10` - containing [patch](https://github.com/moby/moby/commit/977283509f75303bc6612665a04abf76ff1d2468). As a workaround you could - use `docker run --security-opt seccomp=unconfined` instead, however this has security implications. + containing [patch](https://github.com/moby/moby/commit/977283509f75303bc6612665a04abf76ff1d2468). As a workaround you could + use `docker run --security-opt seccomp=unconfined` instead, however this has security implications. ## How to use this image {#how-to-use-image} @@ -101,14 +101,14 @@ Typically you may want to mount the following folders inside your container to a - `/var/lib/clickhouse/` - main folder where ClickHouse stores the data - `/var/log/clickhouse-server/` - logs -```bash -docker run -d \ + ```bash + docker run -d \ -v "$PWD/ch_data:/var/lib/clickhouse/" \ -v "$PWD/ch_logs:/var/log/clickhouse-server/" \ --name some-clickhouse-server --ulimit nofile=262144:262144 clickhouse/clickhouse-server -``` + ``` -You may also want to mount: + You may also want to mount: - `/etc/clickhouse-server/config.d/*.xml` - files with server configuration adjustments - `/etc/clickhouse-server/users.d/*.xml` - files with user settings adjustments @@ -178,7 +178,7 @@ docker run --rm -e CLICKHOUSE_SKIP_USER_SETUP=1 -p 9000:9000/tcp clickhouse/clic ## How to extend this image {#how-to-extend-image} -To perform additional initialization in an image derived from this one, add one or more `*.sql`, `*.sql.gz`, or `*.sh` scripts under `/docker-entrypoint-initdb.d`. 
After the entrypoint calls `initdb`, it will run any `*.sql` files, run any executable `*.sh` scripts, and source any non-executable `*.sh` scripts found in that directory to do further initialization before starting the service. +To perform additional initialization in an image derived from this one, add one or more `*.sql`, `*.sql.gz`, or `*.sh` scripts under `/docker-entrypoint-initdb.d`. After the entrypoint calls `initdb`, it will run any `*.sql` files, run any executable `*.sh` scripts, and source any non-executable `*.sh` scripts found in that directory to do further initialization before starting the service. Also, you can provide environment variables `CLICKHOUSE_USER` & `CLICKHOUSE_PASSWORD` that will be used for clickhouse-client during initialization. For example, to add another user and database, add the following to `/docker-entrypoint-initdb.d/init-db.sh`: diff --git a/docs/getting-started/install/_snippets/_linux_tar_install.md b/docs/getting-started/install/_snippets/_linux_tar_install.md index e3157c768d5..fbe506636e1 100644 --- a/docs/getting-started/install/_snippets/_linux_tar_install.md +++ b/docs/getting-started/install/_snippets/_linux_tar_install.md @@ -3,96 +3,70 @@ > It is recommended to use official pre-compiled `tgz` archives for all Linux distributions, where installation of `deb` or `rpm` packages is not possible. - ## Download and install latest stable version {#install-latest-stable} - The required version can be downloaded with `curl` or `wget` from repository https://packages.clickhouse.com/tgz/. After that downloaded archives should be unpacked and installed with installation scripts. - Below is an example of how to install the latest stable version. - :::note For production environments, it's recommended to use the latest `stable`-version. You can find the release number on this [GitHub page](https://github.com/ClickHouse/ClickHouse/tags) with postfix `-stable`. ::: - ## Get the latest ClickHouse version {#get-latest-version} - Get the latest ClickHouse version from GitHub and store it in `LATEST_VERSION` variable. - ```bash LATEST_VERSION=$(curl -s https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/utils/list-versions/version_date.tsv | \ - grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | sort -V -r | head -n 1) +grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | sort -V -r | head -n 1) export LATEST_VERSION ``` - ## Detect your system architecture {#detect-system-architecture} - Detect the system architecture and set the ARCH variable accordingly: - ```bash case $(uname -m) in - x86_64) ARCH=amd64 ;; # For Intel/AMD 64-bit processors - aarch64) ARCH=arm64 ;; # For ARM 64-bit processors - *) echo "Unknown architecture $(uname -m)"; exit 1 ;; # Exit if architecture isn't supported +x86_64) ARCH=amd64 ;; # For Intel/AMD 64-bit processors +aarch64) ARCH=arm64 ;; # For ARM 64-bit processors +*) echo "Unknown architecture $(uname -m)"; exit 1 ;; # Exit if architecture isn't supported esac ``` - ## Download tarballs for each ClickHouse component {#download-tarballs} - -Download tarballs for each ClickHouse component. The loop tries architecture-specific +Download tarballs for each ClickHouse component. The loop tries architecture-specific packages first, then falls back to generic ones. 
- ```bash for PKG in clickhouse-common-static clickhouse-common-static-dbg clickhouse-server clickhouse-client clickhouse-keeper do - curl -fO "https://packages.clickhouse.com/tgz/stable/$PKG-$LATEST_VERSION-${ARCH}.tgz" \ - || curl -fO "https://packages.clickhouse.com/tgz/stable/$PKG-$LATEST_VERSION.tgz" +curl -fO "https://packages.clickhouse.com/tgz/stable/$PKG-$LATEST_VERSION-${ARCH}.tgz" \ +|| curl -fO "https://packages.clickhouse.com/tgz/stable/$PKG-$LATEST_VERSION.tgz" done ``` - ## Extract and install packages {#extract-and-install} - Run the commands below to extract and install the following packages: - `clickhouse-common-static` - ```bash # Extract and install clickhouse-common-static package tar -xzvf "clickhouse-common-static-$LATEST_VERSION-${ARCH}.tgz" \ - || tar -xzvf "clickhouse-common-static-$LATEST_VERSION.tgz" +|| tar -xzvf "clickhouse-common-static-$LATEST_VERSION.tgz" sudo "clickhouse-common-static-$LATEST_VERSION/install/doinst.sh" ``` - - - `clickhouse-common-static-dbg` - ```bash # Extract and install debug symbols package tar -xzvf "clickhouse-common-static-dbg-$LATEST_VERSION-${ARCH}.tgz" \ - || tar -xzvf "clickhouse-common-static-dbg-$LATEST_VERSION.tgz" +|| tar -xzvf "clickhouse-common-static-dbg-$LATEST_VERSION.tgz" sudo "clickhouse-common-static-dbg-$LATEST_VERSION/install/doinst.sh" ``` - - `clickhouse-server` - ```bash # Extract and install server package with configuration tar -xzvf "clickhouse-server-$LATEST_VERSION-${ARCH}.tgz" \ - || tar -xzvf "clickhouse-server-$LATEST_VERSION.tgz" +|| tar -xzvf "clickhouse-server-$LATEST_VERSION.tgz" sudo "clickhouse-server-$LATEST_VERSION/install/doinst.sh" configure sudo /etc/init.d/clickhouse-server start # Start the server ``` - - `clickhouse-client` - ```bash # Extract and install client package tar -xzvf "clickhouse-client-$LATEST_VERSION-${ARCH}.tgz" \ - || tar -xzvf "clickhouse-client-$LATEST_VERSION.tgz" +|| tar -xzvf "clickhouse-client-$LATEST_VERSION.tgz" sudo "clickhouse-client-$LATEST_VERSION/install/doinst.sh" ``` - - - diff --git a/docs/getting-started/install/_snippets/_macos.md b/docs/getting-started/install/_snippets/_macos.md index d3b21560ff4..7ad22b3333b 100644 --- a/docs/getting-started/install/_snippets/_macos.md +++ b/docs/getting-started/install/_snippets/_macos.md @@ -6,95 +6,61 @@ import privacy_allow from "@site/static/images/knowledgebase/fix-the-developer-v # Install ClickHouse using Homebrew - ## Install using the community Homebrew formula {#install-using-community-homebrew-formula} - To install ClickHouse on macOS using [Homebrew](https://brew.sh/), you can use the ClickHouse community [homebrew formula](https://formulae.brew.sh/cask/clickhouse). - ```bash brew install --cask clickhouse ``` - ## Fix the developer verification error in macOS {#fix-developer-verification-error-macos} - If you install ClickHouse using `brew`, you may encounter an error from MacOS. By default, MacOS will not run applications or tools created by a developer who cannot be verified. - When attempting to run any `clickhouse` command, you may see this error: - MacOS developer verification error dialog - To get around this verification error, you need to remove the app from MacOS' quarantine bin either by finding the appropriate setting in your System Settings window, using the terminal, or by re-installing ClickHouse. - ### System settings process {#system-settings-process} - The easiest way to remove the `clickhouse` executable from the quarantine bin is to: - 1. Open **System settings**. 1. 
Navigate to **Privacy & Security**: - - MacOS Privacy & Security settings default view - +MacOS Privacy & Security settings default view 1. Scroll to the bottom of the window to find a message saying _"clickhouse-macos-aarch64" was blocked from use because it is not from an identified developer"_. 1. Click **Allow Anyway**. - - MacOS Privacy & Security settings showing Allow Anyway button - +MacOS Privacy & Security settings showing Allow Anyway button 1. Enter your MacOS user password. - You should now be able to run `clickhouse` commands in your terminal. - ### Terminal process {#terminal-process} - Sometimes pressing the `Allow Anyway` button doesn't fix this issue, in which case you can also perform this process using the command-line. Or you might just prefer using the command line! - First find out where Homebrew installed the `clickhouse` executable: - ```shell which clickhouse ``` - This should output something like: - ```shell /opt/homebrew/bin/clickhouse ``` - Remove `clickhouse` from the quarantine bin by running `xattr -d com.apple.quarantine` followed by the path from the previous command: - ```shell xattr -d com.apple.quarantine /opt/homebrew/bin/clickhouse ``` - You should now be able to run the `clickhouse` executable: - ```shell clickhouse ``` - This should output something like: - ```bash Use one of the following commands: clickhouse local [args] clickhouse client [args] clickhouse benchmark [args] ... ``` - ## Fix the issue by reinstalling ClickHouse {#fix-issue} - Brew has a command-line option which avoids quarantining installed binaries in the first place. - First, uninstall ClickHouse: - ```shell brew uninstall clickhouse ``` - Now reinstall ClickHouse with `--no-quarantine`: - ```shell brew install --no-quarantine clickhouse ``` diff --git a/docs/getting-started/install/_snippets/_quick_install.md b/docs/getting-started/install/_snippets/_quick_install.md index ea34cbaab28..a8b47ba4f89 100644 --- a/docs/getting-started/install/_snippets/_quick_install.md +++ b/docs/getting-started/install/_snippets/_quick_install.md @@ -1,77 +1,56 @@ # Install ClickHouse via script using curl -If you don't need to install ClickHouse for production, the quickest way to get +If you don't need to install ClickHouse for production, the quickest way to get set up is to run an install script using curl. The script will determine a suitable binary for your OS. - ## Install ClickHouse using curl {#install-clickhouse-using-curl} - Run the following command to download a single binary for your operating system. - ```bash curl https://clickhouse.com/ | sh ``` - :::note For Mac users: If you are getting errors that the developer of the binary cannot be verified, please see [here](/knowledgebase/fix-developer-verification-error-in-macos). ::: - ## Start clickhouse-local {#start-clickhouse-local} - -`clickhouse-local` allows you to process local and remote files using ClickHouse's +`clickhouse-local` allows you to process local and remote files using ClickHouse's powerful SQL syntax and without the need for configuration. Table data is stored -in a temporary location, meaning that after a restart of `clickhouse-local` +in a temporary location, meaning that after a restart of `clickhouse-local` previously created tables are no longer available. - Run the following command to start [clickhouse-local](/operations/utilities/clickhouse-local): - ```bash ./clickhouse ``` - ## Start clickhouse-server {#start-clickhouse-server} - Should you wish to persist data, you'll want to run `clickhouse-server`.
You can start the ClickHouse server using the following command: - ```bash ./clickhouse server ``` - ## Start clickhouse-client {#start-clickhouse-client} - With the server up and running, open a new terminal window and run the following command to launch `clickhouse-client`: - ```bash ./clickhouse client ``` - -You will see something like this: - +You will see something like this: ```response ./clickhouse client ClickHouse client version 24.5.1.117 (official build). Connecting to localhost:9000 as user default. Connected to ClickHouse server version 24.5.1. - local-host :) ``` - Table data is stored in the current directory and still available after a restart of ClickHouse server. If necessary, you can pass -`-C config.xml` as an additional command line argument to `./clickhouse server` +`-C config.xml` as an additional command line argument to `./clickhouse server` and provide further configuration in a configuration -file. All available configuration settings are documented [here](/operations/server-configuration-parameters/settings) and in the +file. All available configuration settings are documented [here](/operations/server-configuration-parameters/settings) and in the [example configuration file template](https://github.com/ClickHouse/ClickHouse/blob/master/programs/server/config.xml). - You are now ready to start sending SQL commands to ClickHouse! - :::tip The [Quick Start](/get-started/quick-start) walks through the steps for creating tables and inserting data. ::: - diff --git a/docs/getting-started/install/_snippets/_rpm_install.md b/docs/getting-started/install/_snippets/_rpm_install.md index 94ae1de6d88..a0f2cfceb3e 100644 --- a/docs/getting-started/install/_snippets/_rpm_install.md +++ b/docs/getting-started/install/_snippets/_rpm_install.md @@ -1,88 +1,63 @@ # Install ClickHouse on rpm-based distributions {#from-rpm-packages} -> It is recommended to use official pre-compiled `rpm` packages for **CentOS**, **RedHat**, and all other rpm-based +> It is recommended to use official pre-compiled `rpm` packages for **CentOS**, **RedHat**, and all other rpm-based > Linux distributions. - ## Setup the RPM repository {#setup-the-rpm-repository} - Add the official repository by running the following command: - ```bash sudo yum install -y yum-utils sudo yum-config-manager --add-repo https://packages.clickhouse.com/rpm/clickhouse.repo ``` - For systems with `zypper` package manager (openSUSE, SLES), run: - ```bash sudo zypper addrepo -r https://packages.clickhouse.com/rpm/clickhouse.repo -g sudo zypper --gpg-auto-import-keys refresh clickhouse-stable ``` - In the steps below, `yum install` can be replaced by `zypper install`, depending on which package manager you are using. - ## Install ClickHouse server and client {#install-clickhouse-server-and-client-1} - To install ClickHouse run the following commands: - ```bash sudo yum install -y clickhouse-server clickhouse-client ``` - - You can replace `stable` with `lts` to use different [release kinds](/knowledgebase/production) based on your needs. - You can download and install packages manually from [packages.clickhouse.com/rpm](https://packages.clickhouse.com/rpm/stable). 
- To specify a particular version, add `-$version` to the end of the package name, for example: - ```bash sudo yum install clickhouse-server-22.8.7.34 ``` - ## Start ClickHouse server {#start-clickhouse-server-1} - To start ClickHouse server, run: - ```bash sudo systemctl enable clickhouse-server sudo systemctl start clickhouse-server sudo systemctl status clickhouse-server ``` - To start ClickHouse client, run: - ```sql clickhouse-client ``` - If you set up a password for your server, then you will need to run: - ```bash clickhouse-client --password ``` - ## Install standalone ClickHouse Keeper {#install-standalone-clickhouse-keeper-1} - :::tip In production environments we strongly recommend running ClickHouse Keeper on dedicated nodes. -In test environments, if you decide to run ClickHouse Server and ClickHouse Keeper on the same server, +In test environments, if you decide to run ClickHouse Server and ClickHouse Keeper on the same server, then you do not need to install ClickHouse Keeper as it is included with ClickHouse server. ::: - To install `clickhouse-keeper` on standalone ClickHouse Keeper servers, run: - ```bash sudo yum install -y clickhouse-keeper ``` - ## Enable and start ClickHouse Keeper {#enable-and-start-clickhouse-keeper-1} - ```bash sudo systemctl enable clickhouse-keeper sudo systemctl start clickhouse-keeper sudo systemctl status clickhouse-keeper ``` - diff --git a/docs/getting-started/install/_snippets/_windows_install.md b/docs/getting-started/install/_snippets/_windows_install.md index aad7d44ab24..41d68262873 100644 --- a/docs/getting-started/install/_snippets/_windows_install.md +++ b/docs/getting-started/install/_snippets/_windows_install.md @@ -7,79 +7,55 @@ To install ClickHouse on Windows you will need WSL (Windows Subsystem for Linux) ::: - ## Install WSL {#install-wsl} - Open Windows PowerShell as administrator and run the following command: - ```bash wsl --install ``` - You will be prompted to enter a new UNIX username and password. After you have entered your desired username and password you should see a message similar to: - ```bash Welcome to Ubuntu 24.04.1 LTS (GNU/Linux 5.15.133.1-microsoft-WSL2 x86_64) ``` - ## Install ClickHouse via script using curl {#install-clickhouse-via-script-using-curl} - Run the following command to install ClickHouse via script using curl: - ```bash curl https://clickhouse.com/ | sh ``` - If the script has successfully run you will see the message: - ```bash Successfully downloaded the ClickHouse binary, you can run it as: - ./clickhouse +./clickhouse ``` - ## Start clickhouse-local {#start-clickhouse-local} - `clickhouse-local` allows you to process local and remote files using ClickHouse's powerful SQL syntax and without the need for configuration. Table data is stored in a temporary location, meaning that after a restart of `clickhouse-local` previously created tables are no longer available. - Run the following command to start [clickhouse-local](/operations/utilities/clickhouse-local): - ```bash ./clickhouse ``` - ## Start clickhouse-server {#start-clickhouse-server} - Should you wish to persist data, you'll want to run `clickhouse-server`. 
You can start the ClickHouse server using the following command: - ```bash ./clickhouse server ``` - ## Start clickhouse-client {#start-clickhouse-client} - With the server up and running, open a new terminal window and run the following command to launch `clickhouse-client`: - ```bash ./clickhouse client ``` - You will see something like this: - ```response ./clickhouse client ClickHouse client version 24.5.1.117 (official build). Connecting to localhost:9000 as user default. Connected to ClickHouse server version 24.5.1. - local-host :) ``` - Table data is stored in the current directory and still available after a restart of ClickHouse server. If necessary, you can pass `-C config.xml` as an additional command line argument to `./clickhouse server` @@ -87,7 +63,5 @@ and provide further configuration in a configuration file. All available configuration settings are documented [here](/operations/server-configuration-parameters/settings) and in the [example configuration file template](https://github.com/ClickHouse/ClickHouse/blob/master/programs/server/config.xml). - You are now ready to start sending SQL commands to ClickHouse! - diff --git a/docs/getting-started/install/install.mdx b/docs/getting-started/install/install.mdx index 60b17ae5c8a..2845edb1d53 100644 --- a/docs/getting-started/install/install.mdx +++ b/docs/getting-started/install/install.mdx @@ -46,5 +46,3 @@ instructions for Opensource ClickHouse: macos_prod={} docker={} /> - - diff --git a/docs/getting-started/playground.md b/docs/getting-started/playground.md index 90371035308..37d84fa609a 100644 --- a/docs/getting-started/playground.md +++ b/docs/getting-started/playground.md @@ -30,7 +30,7 @@ The queries are executed as a read-only user. It implies some limitations: - DDL queries are not allowed - INSERT queries are not allowed -The service also have quotas on its usage. + The service also have quotas on its usage. ## Examples {#examples} diff --git a/docs/getting-started/quick-start/cloud.mdx b/docs/getting-started/quick-start/cloud.mdx index 8cac4d0ef84..4f7746cddee 100644 --- a/docs/getting-started/quick-start/cloud.mdx +++ b/docs/getting-started/quick-start/cloud.mdx @@ -28,203 +28,132 @@ service in [ClickHouse Cloud](https://console.clickhouse.cloud). In this quick s in three easy steps. - ## Create a ClickHouse service {#1-create-a-clickhouse-service} - To create a free ClickHouse service in [ClickHouse Cloud](https://console.clickhouse.cloud), you just need to sign up by completing the following steps: - - - Create an account on the [sign-up page](https://console.clickhouse.cloud/signUp) - - You can choose to sign up using your email or via Google SSO, Microsoft SSO, AWS Marketplace, Google Cloud or Microsoft Azure - - If you sign up using an email and password, remember to verify your email address within the next 24h via the link you receive in your email - - Login using the username and password you just created - +- Create an account on the [sign-up page](https://console.clickhouse.cloud/signUp) +- You can choose to sign up using your email or via Google SSO, Microsoft SSO, AWS Marketplace, Google Cloud or Microsoft Azure +- If you sign up using an email and password, remember to verify your email address within the next 24h via the link you receive in your email +- Login using the username and password you just created Select Plan -
- Once you are logged in, ClickHouse Cloud starts the onboarding wizard which walks you through creating a new ClickHouse service. You will initially be requested to [select a plan](/cloud/manage/cloud-tiers): - Select Plan -
- :::tip We recommend the Scale tier for most workloads. Further details on tiers can be found [here](/cloud/manage/cloud-tiers) ::: - Selecting a plan requires you to select the desired region in which to deploy your first service. The exact options available will depend on the tier selected. In the step below, we assume that the user has opted for the recommended Scale tier. - Select your desired region for deploying the service, and give your new service a name: - New ClickHouse Service -
- By default, the Scale tier creates 3 replicas, each with 4 vCPUs and 16 GiB of RAM. [Vertical autoscaling](/manage/scaling#vertical-auto-scaling) is enabled by default in the Scale tier. - Users can customize the service resources if required, specifying a minimum and maximum size for replicas to scale between. When ready, select `Create service`. - Scaling Limits
- Congratulations! Your ClickHouse Cloud service is up and running and onboarding is complete. Keep reading for details on how to start ingesting and querying your data. - ## Connect to ClickHouse {#2-connect-to-clickhouse} There are 2 ways to connect to ClickHouse: - - Connect using our web-based SQL console - - Connect with your app -
+- Connect using our web-based SQL console +- Connect with your app ### Connect using SQL console {#connect-using-sql-console} - For getting started quickly, ClickHouse provides a web-based SQL console to which you will be redirected on completing onboarding. - SQL Console - - Create a query tab and enter a simple query to verify that your connection is working: - ```sql SHOW databases ``` - You should see 4 databases in the list, plus any that you may have added. - SQL Console
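Any lightweight query also works as a connectivity check. For example, the following returns the server version, your user, and the current server time (a minimal example; the values you see will differ):

```sql
SELECT version() AS clickhouse_version, currentUser() AS user, now() AS server_time
```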
- - That's it - you are ready to start using your new ClickHouse service! - ### Connect with your app {#connect-with-your-app} - Press the connect button in the navigation menu. A modal will open with the credentials for your service and a set of instructions on how to connect with your interface or language clients. - Service Connect
- If you can't see your language client, you may want to check our list of [Integrations](/integrations). - ## Add data {#3-add-data} - ClickHouse is better with data! There are multiple ways to add data and most of them are available on the Data Sources page, which can be accessed in the navigation menu. - Data sources
- You can upload data using the following methods: - - Setup a ClickPipe to start ingesting data from data sources like S3, Postgres, Kafka, GCS - - Use the SQL console - - Use the ClickHouse client - - Upload a file - accepted formats include JSON, CSV and TSV - - Upload data from file URL - +- Setup a ClickPipe to start ingesting data from data sources like S3, Postgres, Kafka, GCS +- Use the SQL console +- Use the ClickHouse client +- Upload a file - accepted formats include JSON, CSV and TSV +- Upload data from file URL ### ClickPipes {#clickpipes} - [ClickPipes](http://clickhouse.com/docs/integrations/clickpipes) is a managed integration platform that makes ingesting data from a diverse set of sources as simple as clicking a few buttons. Designed for the most demanding workloads, ClickPipes's robust and scalable architecture ensures consistent performance and reliability. ClickPipes can be used for long-term streaming needs or one-time data loading job. - Select data source
- ### Add data using the SQL Console {#add-data-using-the-sql-console} - Like most database management systems, ClickHouse logically groups tables into **databases**. Use the [`CREATE DATABASE`](../../sql-reference/statements/create/database.md) command to create a new database in ClickHouse: - ```sql CREATE DATABASE IF NOT EXISTS helloworld ``` - Run the following command to create a table named `my_first_table` in the `helloworld` database: - ```sql CREATE TABLE helloworld.my_first_table ( - user_id UInt32, - message String, - timestamp DateTime, - metric Float32 +user_id UInt32, +message String, +timestamp DateTime, +metric Float32 ) ENGINE = MergeTree() PRIMARY KEY (user_id, timestamp) ``` - In the example above, `my_first_table` is a [`MergeTree`](../../engines/table-engines/mergetree-family/mergetree.md) table with four columns: - - - `user_id`: a 32-bit unsigned integer ([UInt32](../../sql-reference/data-types/int-uint.md)) - - `message`: a [String](../../sql-reference/data-types/string.md) data type, which replaces types like `VARCHAR`, `BLOB`, `CLOB` and others from other database systems - - `timestamp`: a [DateTime](../../sql-reference/data-types/datetime.md) value, which represents an instant in time - - `metric`: a 32-bit floating point number ([Float32](../../sql-reference/data-types/float.md)) - +- `user_id`: a 32-bit unsigned integer ([UInt32](../../sql-reference/data-types/int-uint.md)) +- `message`: a [String](../../sql-reference/data-types/string.md) data type, which replaces types like `VARCHAR`, `BLOB`, `CLOB` and others from other database systems +- `timestamp`: a [DateTime](../../sql-reference/data-types/datetime.md) value, which represents an instant in time +- `metric`: a 32-bit floating point number ([Float32](../../sql-reference/data-types/float.md)) :::note Table engines Table engines determine: - - How and where data is stored - - Which queries are supported - - Whether or not the data is replicated -
+- How and where data is stored +- Which queries are supported +- Whether or not the data is replicated There are many table engines to choose from, but for a simple table on a single-node ClickHouse server, [`MergeTree`](/engines/table-engines/mergetree-family/mergetree.md) is your likely choice. ::: - #### A Brief Intro to Primary Keys {#a-brief-intro-to-primary-keys} - Before you go any further, it is important to understand how primary keys work in ClickHouse (the implementation of primary keys might seem unexpected!): - - - primary keys in ClickHouse are **_not unique_** for each row in a table - +- primary keys in ClickHouse are **_not unique_** for each row in a table The primary key of a ClickHouse table determines how the data is sorted when written to disk. Every 8,192 rows or 10MB of data (referred to as the **index granularity**) creates an entry in the primary key index file. This granularity concept creates a **sparse index** that can easily fit in memory, and the granules represent a stripe of the smallest amount of column data that gets processed during `SELECT` queries. - The primary key can be defined using the `PRIMARY KEY` parameter. If you define a table without a `PRIMARY KEY` specified, then the key becomes the tuple specified in the `ORDER BY` clause. If you specify both a `PRIMARY KEY` and an `ORDER BY`, the primary key must be a subset of the sort order. - The primary key is also the sorting key, which is a tuple of `(user_id, timestamp)`. Therefore, the data stored in each column file will be sorted by `user_id`, then `timestamp`. - For a deep dive into core ClickHouse concepts, see ["Core Concepts"](../../managing-data/core-concepts/index.md). - #### Insert data into your table {#insert-data-into-your-table} - You can use the familiar [`INSERT INTO TABLE`](../../sql-reference/statements/insert-into.md) command with ClickHouse, but it is important to understand that each insert into a [`MergeTree`](/engines/table-engines/mergetree-family/mergetree.md) table causes a **part** to be created in storage. - :::tip ClickHouse best practice Insert a large number of rows per batch - tens of thousands or even millions of rows at once. Don't worry - ClickHouse can easily handle that type of volume - and it will [save you money](/best-practices/selecting-an-insert-strategy#batch-inserts-if-synchronous) by sending fewer write requests to your service. ::: -
- Even for a simple example, let's insert more than one row at a time: - ```sql INSERT INTO helloworld.my_first_table (user_id, message, timestamp, metric) VALUES - (101, 'Hello, ClickHouse!', now(), -1.0 ), - (102, 'Insert a lot of rows per batch', yesterday(), 1.41421 ), - (102, 'Sort your data based on your commonly-used queries', today(), 2.718 ), - (101, 'Granules are the smallest chunks of data read', now() + 5, 3.14159 ) +(101, 'Hello, ClickHouse!', now(), -1.0 ), +(102, 'Insert a lot of rows per batch', yesterday(), 1.41421 ), +(102, 'Sort your data based on your commonly-used queries', today(), 2.718 ), +(101, 'Granules are the smallest chunks of data read', now() + 5, 3.14159 ) ``` - :::note Notice the `timestamp` column is populated using various [**Date**](../../sql-reference/data-types/date.md) and [**DateTime**](../../sql-reference/data-types/datetime.md) functions. ClickHouse has hundreds of useful functions that you can [view in the **Functions** section](/sql-reference/functions/). ::: - Let's verify it worked: - ```sql SELECT * FROM helloworld.my_first_table ``` - ### Add data using the ClickHouse Client {#add-data-using-the-clickhouse-client} - You can also connect to your ClickHouse Cloud service using a command-line tool named [**clickhouse client**](/interfaces/cli). Click `Connect` on the left menu to access these details. From the dialog select `Native` from the drop-down: - clickhouse client connection details
- 1. Install [ClickHouse](/interfaces/cli). - 2. Run the command, substituting your hostname, username, and password: - ```bash ./clickhouse client --host HOSTNAME.REGION.CSP.clickhouse.cloud \ --secure --port 9440 \ @@ -235,19 +164,13 @@ If you get the smiley face prompt, you are ready to run queries! ```response :) ``` - 3. Give it a try by running the following query: - -
- ```sql SELECT * FROM helloworld.my_first_table ORDER BY timestamp ``` - Notice the response comes back in a nice table format: - ```response ┌─user_id─┬─message────────────────────────────────────────────┬───────────timestamp─┬──metric─┐ │ 102 │ Insert a lot of rows per batch │ 2022-03-21 00:00:00 │ 1.41421 │ @@ -255,14 +178,9 @@ Notice the response comes back in a nice table format: │ 101 │ Hello, ClickHouse! │ 2022-03-22 14:04:09 │ -1 │ │ 101 │ Granules are the smallest chunks of data read │ 2022-03-22 14:04:14 │ 3.14159 │ └─────────┴────────────────────────────────────────────────────┴─────────────────────┴─────────┘ - 4 rows in set. Elapsed: 0.008 sec. ``` - 4. Add a [`FORMAT`](../../sql-reference/statements/select/format.md) clause to specify one of the [many supported output formats of ClickHouse](/interfaces/formats/): - -
- ```sql SELECT * FROM helloworld.my_first_table @@ -272,41 +190,27 @@ FORMAT TabSeparated In the above query, the output is returned as tab-separated: ```response Query id: 3604df1c-acfd-4117-9c56-f86c69721121 - 102 Insert a lot of rows per batch 2022-03-21 00:00:00 1.41421 102 Sort your data based on your commonly-used queries 2022-03-22 00:00:00 2.718 101 Hello, ClickHouse! 2022-03-22 14:04:09 -1 101 Granules are the smallest chunks of data read 2022-03-22 14:04:14 3.14159 - 4 rows in set. Elapsed: 0.005 sec. ``` - 5. To exit the `clickhouse client`, enter the **exit** command: - -
- ```bash exit ``` - ### Upload a File {#upload-a-file} - A common task when getting started with a database is to insert some data that you already have in files. We have some sample clickstream data online that you can insert - it includes a user ID, a URL that was visited, and the timestamp of the event. - Suppose we have the following text in a CSV file named `data.csv`: - ```bash title="data.csv" 102,This is data in a file,2022-02-22 10:43:28,123.45 101,It is comma-separated,2022-02-23 00:00:00,456.78 103,Use FORMAT to specify the format,2022-02-21 10:43:30,678.90 ``` - 1. The following command inserts the data into `my_first_table`: - -
- ```bash ./clickhouse client --host HOSTNAME.REGION.CSP.clickhouse.cloud \ --secure --port 9440 \ @@ -314,14 +218,8 @@ Suppose we have the following text in a CSV file named `data.csv`: --password \ --query='INSERT INTO helloworld.my_first_table FORMAT CSV' < data.csv ``` - 2. Notice the new rows appear in the table now if querying from the SQL console: - -
- New rows from CSV file -
-
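If you would rather verify the upload with a query than with the console view above, something along these lines should return just the three rows from `data.csv` (a sketch that relies on the sample timestamps above all falling in February 2022):

```sql
SELECT user_id, message, timestamp, metric
FROM helloworld.my_first_table
WHERE toYYYYMM(timestamp) = 202202   -- only the rows loaded from data.csv
ORDER BY timestamp
```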
## What's Next? {#whats-next} diff --git a/docs/getting-started/quick-start/oss.mdx b/docs/getting-started/quick-start/oss.mdx index ea8d146b838..299e074f5f1 100644 --- a/docs/getting-started/quick-start/oss.mdx +++ b/docs/getting-started/quick-start/oss.mdx @@ -21,122 +21,91 @@ learn to run ClickHouse server, and use the ClickHouse client to create a table, then insert data into it and run a query to select that data. - ## Download ClickHouse {#download-the-binary} - ClickHouse runs natively on Linux, FreeBSD and macOS, and runs on Windows via the [WSL](https://learn.microsoft.com/en-us/windows/wsl/about). The simplest way to download ClickHouse locally is to run the following `curl` command. It determines if your operating system is supported, then downloads an appropriate ClickHouse binary. - :::note We recommend running the command below from a new and empty subdirectory as some configuration files will be created in the directory the binary is located in the first time ClickHouse server is run. ::: - ```bash curl https://clickhouse.com/ | sh ``` - You should see: - ``` Successfully downloaded the ClickHouse binary, you can run it as: - ./clickhouse - +./clickhouse You can also install it: sudo ./clickhouse install ``` - At this stage, you can ignore the prompt to run the `install` command. - :::note For Mac users: If you are getting errors that the developer of the binary cannot be verified, please see ["Fix the Developer Verification Error in MacOS"](https://clickhouse.com/docs/knowledgebase/fix-developer-verification-error-in-macos). ::: - - ## Start the server - Run the following command to start the ClickHouse server: - ```bash ./clickhouse server ``` - You should see the terminal fill up with logging. This is expected. In ClickHouse the [default logging level](https://clickhouse.com/docs/knowledgebase/why_default_logging_verbose) is set to `trace` rather than `warning`. - ## Start the client - Use `clickhouse-client` to connect to your ClickHouse service. Open a new terminal, change directories to where your `clickhouse` binary is saved, and run the following command: - ```bash ./clickhouse client ``` - You should see a smiling face as it connects to your service running on localhost: - ```response my-host :) ``` - ## Create a table - Use `CREATE TABLE` to define a new table. Typical SQL DDL commands work in ClickHouse with one addition - tables in ClickHouse require an `ENGINE` clause. Use [`MergeTree`](/engines/table-engines/mergetree-family/mergetree) to take advantage of the performance benefits of ClickHouse: - ```sql CREATE TABLE my_first_table ( - user_id UInt32, - message String, - timestamp DateTime, - metric Float32 +user_id UInt32, +message String, +timestamp DateTime, +metric Float32 ) ENGINE = MergeTree PRIMARY KEY (user_id, timestamp) ``` - ## Insert data - You can use the familiar `INSERT INTO TABLE` command with ClickHouse, but it is important to understand that each insert into a `MergeTree` table causes what we call a **part** in ClickHouse to be created in storage. These parts later get merged in the background by ClickHouse. - In ClickHouse, we try to bulk insert lots of rows at a time (tens of thousands or even millions at once) to minimize the number of [**parts**](/parts) that need to get merged in the background process. - In this guide, we won't worry about that just yet. 
Run the following command to insert a few rows of data into your table: - ```sql INSERT INTO my_first_table (user_id, message, timestamp, metric) VALUES - (101, 'Hello, ClickHouse!', now(), -1.0 ), - (102, 'Insert a lot of rows per batch', yesterday(), 1.41421 ), - (102, 'Sort your data based on your commonly-used queries', today(), 2.718 ), - (101, 'Granules are the smallest chunks of data read', now() + 5, 3.14159 ) +(101, 'Hello, ClickHouse!', now(), -1.0 ), +(102, 'Insert a lot of rows per batch', yesterday(), 1.41421 ), +(102, 'Sort your data based on your commonly-used queries', today(), 2.718 ), +(101, 'Granules are the smallest chunks of data read', now() + 5, 3.14159 ) ``` - ## Query your new table - You can write a `SELECT` query just like you would with any SQL database: - ```sql SELECT * FROM my_first_table ORDER BY timestamp ``` Notice the response comes back in a nice table format: - ```text ┌─user_id─┬─message────────────────────────────────────────────┬───────────timestamp─┬──metric─┐ │ 102 │ Insert a lot of rows per batch │ 2022-03-21 00:00:00 │ 1.41421 │ @@ -144,227 +113,175 @@ Notice the response comes back in a nice table format: │ 101 │ Hello, ClickHouse! │ 2022-03-22 14:04:09 │ -1 │ │ 101 │ Granules are the smallest chunks of data read │ 2022-03-22 14:04:14 │ 3.14159 │ └─────────┴────────────────────────────────────────────────────┴─────────────────────┴─────────┘ - 4 rows in set. Elapsed: 0.008 sec. ``` - ## Insert your own data - The next step is to get your own data into ClickHouse. We have lots of [table functions](/sql-reference/table-functions/index.md) and [integrations](/integrations) for ingesting data. We have some examples in the tabs below, or you can check out our [Integrations](/integrations) page for a long list of technologies that integrate with ClickHouse. - - - - Use the [`s3` table function](/sql-reference/table-functions/s3.md) to - read files from S3. It's a table function - meaning that the result is a table - that can be: - - 1. used as the source of a `SELECT` query (allowing you to run ad-hoc queries and - leave your data in S3), or... - 2. insert the resulting table into a `MergeTree` table (when you are ready to - move your data into ClickHouse) - - An ad-hoc query looks like: - - ```sql - SELECT - passenger_count, - avg(toFloat32(total_amount)) - FROM s3( - 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/trips_0.gz', - 'TabSeparatedWithNames' - ) - GROUP BY passenger_count - ORDER BY passenger_count; - ``` - - Moving the data into a ClickHouse table looks like the following, where - `nyc_taxi` is a `MergeTree` table: - - ```sql - INSERT INTO nyc_taxi - SELECT * FROM s3( - 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/trips_0.gz', - 'TabSeparatedWithNames' - ) - SETTINGS input_format_allow_errors_num=25000; - ``` - - View our [collection of AWS S3 documentation pages](/integrations/data-ingestion/s3/index.md) for lots more details and examples of using S3 with ClickHouse. -
-
- - - The [`s3` table function](/sql-reference/table-functions/s3.md) used for - reading data in AWS S3 also works on files in Google Cloud Storage. - - For example: - - ```sql - SELECT - * - FROM s3( - 'https://storage.googleapis.com/my-bucket/trips.parquet', - 'MY_GCS_HMAC_KEY', - 'MY_GCS_HMAC_SECRET_KEY', - 'Parquet' - ) - LIMIT 1000 - ``` - - Find more details on the [`s3` table function page](/sql-reference/table-functions/s3.md). -
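As with AWS S3, the result can be inserted straight into a table once you are ready to move the data into ClickHouse. The statement below is only a sketch: `trips` stands for an existing `MergeTree` table whose columns match the Parquet file, and the bucket path and HMAC credentials are placeholders:

```sql
INSERT INTO trips
SELECT *
FROM s3(
    'https://storage.googleapis.com/my-bucket/trips.parquet',
    'MY_GCS_HMAC_KEY',
    'MY_GCS_HMAC_SECRET_KEY',
    'Parquet'
);
```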
-
- - - The [`url` table function](/sql-reference/table-functions/url) reads - files accessible from the web: - - ```sql - --By default, ClickHouse prevents redirects to protect from SSRF attacks. - --The URL below requires a redirect, so we must set max_http_get_redirects > 0. - SET max_http_get_redirects=10; - - SELECT * - FROM url( - 'http://prod2.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv', - 'CSV' - ); - ``` - - Find more details on the [`url` table function page](/sql-reference/table-functions/url). -
-
- - - Use the [`file` table engine](/sql-reference/table-functions/file) to - read a local file. For simplicity, copy the file to the `user_files` directory - (which is found in the directory where you downloaded the ClickHouse binary). - - ```sql - DESCRIBE TABLE file('comments.tsv') - - Query id: 8ca9b2f9-65a2-4982-954a-890de710a336 - - ┌─name──────┬─type────────────────────┐ - │ id │ Nullable(Int64) │ - │ type │ Nullable(String) │ - │ author │ Nullable(String) │ - │ timestamp │ Nullable(DateTime64(9)) │ - │ comment │ Nullable(String) │ - │ children │ Array(Nullable(Int64)) │ - └───────────┴─────────────────────────┘ - ``` - - Notice ClickHouse infers the names and data types of your columns by analyzing a - large batch of rows. If ClickHouse can not determine the file format from the - filename, you can specify it as the second argument: - - ```sql - SELECT count() - FROM file( - 'comments.tsv', - 'TabSeparatedWithNames' - ) - ``` - - View the [`file` table function](/sql-reference/table-functions/file) - docs page for more details. -
-
- - - Use the [`postgresql` table function](/sql-reference/table-functions/postgresql) - to read data from a table in PostgreSQL: - - ```sql - SELECT * - FROM - postgresql( - 'localhost:5432', - 'my_database', - 'my_table', - 'postgresql_user', - 'password') - ; - ``` - - View the [`postgresql` table function](/sql-reference/table-functions/postgresql) - docs page for more details. -
-
- - - Use the [`mysql` table function](/sql-reference/table-functions/mysql) - to read data from a table in MySQL: - - ```sql - SELECT * - FROM - mysql( - 'localhost:3306', - 'my_database', - 'my_table', - 'mysql_user', - 'password') - ; - ``` - - View the [`mysql` table function](/sql-reference/table-functions/mysql) - docs page for more details. -
-
- - - ClickHouse can read data from any ODBC or JDBC data source: - - ```sql - SELECT * - FROM - odbc( - 'DSN=mysqlconn', - 'my_database', - 'my_table' - ); - ``` - - View the [`odbc` table function](/sql-reference/table-functions/odbc) - and the [`jdbc` table function](/sql-reference/table-functions/jdbc) docs - pages for more details. -
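The example above shows `odbc`; the `jdbc` table function follows the same pattern but requires the separate clickhouse-jdbc-bridge process to be running. A sketch, with a placeholder connection string, database, and table:

```sql
-- Sketch: read a remote table through the JDBC bridge
SELECT *
FROM jdbc(
    'jdbc:mysql://localhost:3306/?user=root&password=root',
    'my_database',
    'my_table'
);
```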
-
- - - Message queues can stream data into ClickHouse using the corresponding table - engine, including: - - - **Kafka**: integrate with Kafka using the [`Kafka` table engine](/engines/table-engines/integrations/kafka) - - **Amazon MSK**: integrate with [Amazon Managed Streaming for Apache Kafka (MSK)](/integrations/kafka/cloud/amazon-msk/) - - **RabbitMQ**: integrate with RabbitMQ using the [`RabbitMQ` table engine](/engines/table-engines/integrations/rabbitmq) -
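As a minimal sketch of the first option above (the broker, topic, consumer group, and column list are placeholders), a `Kafka` table engine definition looks like this; in practice it is usually paired with a materialized view that moves the consumed rows into a `MergeTree` table:

```sql
-- Sketch: a Kafka consumer table; rows read from the topic appear here once
CREATE TABLE kafka_queue
(
    user_id UInt32,
    message String
)
ENGINE = Kafka
SETTINGS
    kafka_broker_list = 'localhost:9092',
    kafka_topic_list = 'my_topic',
    kafka_group_name = 'my_consumer_group',
    kafka_format = 'JSONEachRow';
```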
-
- - - ClickHouse has table functions to read data from the following sources: - - - **Hadoop**: integrate with Apache Hadoop using the [`hdfs` table function](/sql-reference/table-functions/hdfs) - - **Hudi**: read from existing Apache Hudi tables in S3 using the [`hudi` table function](/sql-reference/table-functions/hudi) - - **Iceberg**: read from existing Apache Iceberg tables in S3 using the [`iceberg` table function](/sql-reference/table-functions/iceberg) - - **DeltaLake**: read from existing Delta Lake tables in S3 using the [`deltaLake` table function](/sql-reference/table-functions/deltalake) -
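For example, a sketch of reading an existing Iceberg table in S3 (the bucket path and credentials are placeholders):

```sql
-- Sketch: count rows in an existing Apache Iceberg table stored in S3
SELECT count()
FROM iceberg(
    'https://my-bucket.s3.amazonaws.com/path/to/iceberg_table',
    'MY_ACCESS_KEY_ID',
    'MY_SECRET_ACCESS_KEY'
);
```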
-
- - - Check out our [long list of ClickHouse integrations](/integrations) to find how to connect your existing frameworks and data sources to ClickHouse. -
-
+ +Use the [`s3` table function](/sql-reference/table-functions/s3.md) to +read files from S3. It's a table function - meaning that the result is a table +that can be: +1. used as the source of a `SELECT` query (allowing you to run ad-hoc queries and +leave your data in S3), or... +2. insert the resulting table into a `MergeTree` table (when you are ready to +move your data into ClickHouse) +An ad-hoc query looks like: +```sql +SELECT +passenger_count, +avg(toFloat32(total_amount)) +FROM s3( +'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/trips_0.gz', +'TabSeparatedWithNames' +) +GROUP BY passenger_count +ORDER BY passenger_count; +``` +Moving the data into a ClickHouse table looks like the following, where +`nyc_taxi` is a `MergeTree` table: +```sql +INSERT INTO nyc_taxi +SELECT * FROM s3( +'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/trips_0.gz', +'TabSeparatedWithNames' +) +SETTINGS input_format_allow_errors_num=25000; +``` +View our [collection of AWS S3 documentation pages](/integrations/data-ingestion/s3/index.md) for lots more details and examples of using S3 with ClickHouse. + + +The [`s3` table function](/sql-reference/table-functions/s3.md) used for +reading data in AWS S3 also works on files in Google Cloud Storage. +For example: +```sql +SELECT +* +FROM s3( +'https://storage.googleapis.com/my-bucket/trips.parquet', +'MY_GCS_HMAC_KEY', +'MY_GCS_HMAC_SECRET_KEY', +'Parquet' +) +LIMIT 1000 +``` +Find more details on the [`s3` table function page](/sql-reference/table-functions/s3.md). + + +The [`url` table function](/sql-reference/table-functions/url) reads +files accessible from the web: +```sql +--By default, ClickHouse prevents redirects to protect from SSRF attacks. +--The URL below requires a redirect, so we must set max_http_get_redirects > 0. +SET max_http_get_redirects=10; +SELECT * +FROM url( +'http://prod2.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv', +'CSV' +); +``` +Find more details on the [`url` table function page](/sql-reference/table-functions/url). + + +Use the [`file` table engine](/sql-reference/table-functions/file) to +read a local file. For simplicity, copy the file to the `user_files` directory +(which is found in the directory where you downloaded the ClickHouse binary). +```sql +DESCRIBE TABLE file('comments.tsv') +Query id: 8ca9b2f9-65a2-4982-954a-890de710a336 +┌─name──────┬─type────────────────────┐ +│ id │ Nullable(Int64) │ +│ type │ Nullable(String) │ +│ author │ Nullable(String) │ +│ timestamp │ Nullable(DateTime64(9)) │ +│ comment │ Nullable(String) │ +│ children │ Array(Nullable(Int64)) │ +└───────────┴─────────────────────────┘ +``` +Notice ClickHouse infers the names and data types of your columns by analyzing a +large batch of rows. If ClickHouse can not determine the file format from the +filename, you can specify it as the second argument: +```sql +SELECT count() +FROM file( +'comments.tsv', +'TabSeparatedWithNames' +) +``` +View the [`file` table function](/sql-reference/table-functions/file) +docs page for more details. + + +Use the [`postgresql` table function](/sql-reference/table-functions/postgresql) +to read data from a table in PostgreSQL: +```sql +SELECT * +FROM +postgresql( +'localhost:5432', +'my_database', +'my_table', +'postgresql_user', +'password') +; +``` +View the [`postgresql` table function](/sql-reference/table-functions/postgresql) +docs page for more details. 
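As with the S3 example, reading is only half the story: a sketch of moving the rows into ClickHouse (the target table name is a placeholder for an existing `MergeTree` table):

```sql
-- Sketch: materialize the PostgreSQL rows into a local ClickHouse table
INSERT INTO my_clickhouse_table
SELECT *
FROM postgresql(
    'localhost:5432',
    'my_database',
    'my_table',
    'postgresql_user',
    'password'
);
```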
+ + +Use the [`mysql` table function](/sql-reference/table-functions/mysql) +to read data from a table in MySQL: +```sql +SELECT * +FROM +mysql( +'localhost:3306', +'my_database', +'my_table', +'mysql_user', +'password') +; +``` +View the [`mysql` table function](/sql-reference/table-functions/mysql) +docs page for more details. + + +ClickHouse can read data from any ODBC or JDBC data source: +```sql +SELECT * +FROM +odbc( +'DSN=mysqlconn', +'my_database', +'my_table' +); +``` +View the [`odbc` table function](/sql-reference/table-functions/odbc) +and the [`jdbc` table function](/sql-reference/table-functions/jdbc) docs +pages for more details. + + +Message queues can stream data into ClickHouse using the corresponding table +engine, including: +- **Kafka**: integrate with Kafka using the [`Kafka` table engine](/engines/table-engines/integrations/kafka) +- **Amazon MSK**: integrate with [Amazon Managed Streaming for Apache Kafka (MSK)](/integrations/kafka/cloud/amazon-msk/) +- **RabbitMQ**: integrate with RabbitMQ using the [`RabbitMQ` table engine](/engines/table-engines/integrations/rabbitmq) + + +ClickHouse has table functions to read data from the following sources: +- **Hadoop**: integrate with Apache Hadoop using the [`hdfs` table function](/sql-reference/table-functions/hdfs) +- **Hudi**: read from existing Apache Hudi tables in S3 using the [`hudi` table function](/sql-reference/table-functions/hudi) +- **Iceberg**: read from existing Apache Iceberg tables in S3 using the [`iceberg` table function](/sql-reference/table-functions/iceberg) +- **DeltaLake**: read from existing Delta Lake tables in S3 using the [`deltaLake` table function](/sql-reference/table-functions/deltalake) + + +Check out our [long list of ClickHouse integrations](/integrations) to find how to connect your existing frameworks and data sources to ClickHouse. +
- ## Explore - - Check out our [Core Concepts](/managing-data/core-concepts) section to learn some of the fundamentals of how ClickHouse works under the hood. - Check out the [Advanced Tutorial](tutorial.md) which takes a much deeper dive into the key concepts and capabilities of ClickHouse. - Continue your learning by taking our free on-demand training courses at the [ClickHouse Academy](https://learn.clickhouse.com/visitor_class_catalog). @@ -372,6 +289,4 @@ technologies that integrate with ClickHouse. - If your data is coming from an external source, view our [collection of integration guides](/integrations/) for connecting to message queues, databases, pipelines and more. - If you are using a UI/BI visualization tool, view the [user guides for connecting a UI to ClickHouse](/integrations/data-visualization/). - The user guide on [primary keys](/guides/best-practices/sparse-primary-indexes.md) is everything you need to know about primary keys and how to define them. -
- diff --git a/docs/guides/best-practices/avoidmutations.md b/docs/guides/best-practices/avoidmutations.md index f59327ce3f1..c9a2af3a42f 100644 --- a/docs/guides/best-practices/avoidmutations.md +++ b/docs/guides/best-practices/avoidmutations.md @@ -8,4 +8,3 @@ description: 'Mutations refers to ALTER queries that manipulate table data' import Content from '@site/docs/best-practices/_snippets/_avoid_mutations.md'; - diff --git a/docs/guides/best-practices/avoidoptimizefinal.md b/docs/guides/best-practices/avoidoptimizefinal.md index 20c8daa5e5d..55529f979aa 100644 --- a/docs/guides/best-practices/avoidoptimizefinal.md +++ b/docs/guides/best-practices/avoidoptimizefinal.md @@ -8,4 +8,3 @@ description: 'Using the OPTIMIZE TABLE ... FINAL query will initiate an unschedu import Content from '@site/docs/best-practices/_snippets/_avoid_optimize_final.md'; - diff --git a/docs/guides/best-practices/index.md b/docs/guides/best-practices/index.md index 6ff0bd04c5f..be2842736e7 100644 --- a/docs/guides/best-practices/index.md +++ b/docs/guides/best-practices/index.md @@ -7,8 +7,8 @@ title: 'Performance and Optimizations' # Performance and optimizations -This section contains tips and best practices for improving performance with ClickHouse. -We recommend users read [Core Concepts](/parts) as a precursor to this section, +This section contains tips and best practices for improving performance with ClickHouse. +We recommend users read [Core Concepts](/parts) as a precursor to this section, which covers the main concepts required to improve performance. | Topic | Description | diff --git a/docs/guides/best-practices/prewhere.md b/docs/guides/best-practices/prewhere.md index 0b24b0218dd..7c88ebcf9e2 100644 --- a/docs/guides/best-practices/prewhere.md +++ b/docs/guides/best-practices/prewhere.md @@ -20,7 +20,6 @@ The [PREWHERE clause](/sql-reference/statements/select/prewhere) is a query exec This guide explains how PREWHERE works, how to measure its impact, and how to tune it for best performance. - ## Query processing without PREWHERE optimization {#query-processing-without-prewhere-optimization} We'll start by illustrating how a query on the [uk_price_paid_simple](/parts) table is processed without using PREWHERE: @@ -34,13 +33,12 @@ We'll start by illustrating how a query on the [uk_price_paid_simple](/parts) ta ③ It scans the index entries to identify which granules from the town column might contain rows matching the predicate. -④ These potentially relevant granules are loaded into memory, along with positionally aligned granules from any other columns needed for the query. +④ These potentially relevant granules are loaded into memory, along with positionally aligned granules from any other columns needed for the query. ⑤ The remaining filters are then applied during query execution. As you can see, without PREWHERE, all potentially relevant columns are loaded before filtering, even if only a few rows actually match. - ## How PREWHERE improves query efficiency {#how-prewhere-improves-query-efficiency} The following animations show how the query from above is processed with a PREWHERE clause applied to all query predicates. @@ -100,7 +98,6 @@ ClickHouse follows this strategy by default as of version [23.2](https://clickho Starting with version [23.11](https://clickhouse.com/blog/clickhouse-release-23-11#column-statistics-for-prewhere), optional column statistics can further improve this by choosing the filter processing order based on actual data selectivity, not just column size. 
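The sections below focus on this automatic optimization, but `PREWHERE` can also be written explicitly when you want to control which predicate is evaluated first. A sketch against the same example table (the choice of predicate here is illustrative only):

```sql
-- Sketch: evaluate the town filter first, then the remaining WHERE conditions
SELECT street
FROM uk.uk_price_paid_simple
PREWHERE town = 'LONDON'
WHERE date > '2024-12-31' AND price < 10_000;
```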
- ## How to measure PREWHERE impact {#how-to-measure-prewhere-impact} To validate that PREWHERE is helping your queries, you can compare query performance with and without the `optimize_move_to_prewhere setting` enabled. @@ -122,86 +119,86 @@ SETTINGS optimize_move_to_prewhere = false; 1. │ MOYSER ROAD │ 2. │ AVENUE ROAD │ 3. │ AVENUE ROAD │ - └─────────────┘ + └─────────────┘ -3 rows in set. Elapsed: 0.056 sec. Processed 2.31 million rows, 23.36 MB (41.09 million rows/s., 415.43 MB/s.) -Peak memory usage: 132.10 MiB. -``` + 3 rows in set. Elapsed: 0.056 sec. Processed 2.31 million rows, 23.36 MB (41.09 million rows/s., 415.43 MB/s.) + Peak memory usage: 132.10 MiB. + ``` -ClickHouse read **23.36 MB** of column data while processing 2.31 million rows for the query. + ClickHouse read **23.36 MB** of column data while processing 2.31 million rows for the query. -Next, we run the query with the `optimize_move_to_prewhere` setting enabled. (Note that this setting is optional, as the setting is enabled by default): -```sql -SELECT + Next, we run the query with the `optimize_move_to_prewhere` setting enabled. (Note that this setting is optional, as the setting is enabled by default): + ```sql + SELECT street -FROM - uk.uk_price_paid_simple -WHERE - town = 'LONDON' AND date > '2024-12-31' AND price < 10_000 -SETTINGS optimize_move_to_prewhere = true; -``` - -```txt - ┌─street──────┐ + FROM + uk.uk_price_paid_simple + WHERE + town = 'LONDON' AND date > '2024-12-31' AND price < 10_000 + SETTINGS optimize_move_to_prewhere = true; + ``` + + ```txt + ┌─street──────┐ 1. │ MOYSER ROAD │ 2. │ AVENUE ROAD │ 3. │ AVENUE ROAD │ - └─────────────┘ + └─────────────┘ -3 rows in set. Elapsed: 0.017 sec. Processed 2.31 million rows, 6.74 MB (135.29 million rows/s., 394.44 MB/s.) -Peak memory usage: 132.11 MiB. -``` + 3 rows in set. Elapsed: 0.017 sec. Processed 2.31 million rows, 6.74 MB (135.29 million rows/s., 394.44 MB/s.) + Peak memory usage: 132.11 MiB. + ``` -The same number of rows was processed (2.31 million), but thanks to PREWHERE, ClickHouse read over three times less column data—just 6.74 MB instead of 23.36 MB—which cut the total runtime by a factor of 3. + The same number of rows was processed (2.31 million), but thanks to PREWHERE, ClickHouse read over three times less column data—just 6.74 MB instead of 23.36 MB—which cut the total runtime by a factor of 3. -For deeper insight into how ClickHouse applies PREWHERE behind the scenes, use EXPLAIN and trace logs. + For deeper insight into how ClickHouse applies PREWHERE behind the scenes, use EXPLAIN and trace logs. -We inspect the query's logical plan using the [EXPLAIN](/sql-reference/statements/explain#explain-plan) clause: -```sql -EXPLAIN PLAN actions = 1 -SELECT + We inspect the query's logical plan using the [EXPLAIN](/sql-reference/statements/explain#explain-plan) clause: + ```sql + EXPLAIN PLAN actions = 1 + SELECT street -FROM - uk.uk_price_paid_simple -WHERE - town = 'LONDON' and date > '2024-12-31' and price < 10_000; -``` - -```txt -... -Prewhere info - Prewhere filter column: - and(greater(__table1.date, '2024-12-31'_String), - less(__table1.price, 10000_UInt16), - equals(__table1.town, 'LONDON'_String)) -... -``` - -We omit most of the plan output here, as it's quite verbose. In essence, it shows that all three column predicates were automatically moved to PREWHERE. - -When reproducing this yourself, you'll also see in the query plan that the order of these predicates is based on the columns' data type sizes. 
Since we haven't enabled column statistics, ClickHouse uses size as the fallback for determining the PREWHERE processing order. - -If you want to go even further under the hood, you can observe each individual PREWHERE processing step by instructing ClickHouse to return all test-level log entries during query execution: -```sql -SELECT + FROM + uk.uk_price_paid_simple + WHERE + town = 'LONDON' and date > '2024-12-31' and price < 10_000; + ``` + + ```txt + ... + Prewhere info + Prewhere filter column: + and(greater(__table1.date, '2024-12-31'_String), + less(__table1.price, 10000_UInt16), + equals(__table1.town, 'LONDON'_String)) + ... + ``` + + We omit most of the plan output here, as it's quite verbose. In essence, it shows that all three column predicates were automatically moved to PREWHERE. + + When reproducing this yourself, you'll also see in the query plan that the order of these predicates is based on the columns' data type sizes. Since we haven't enabled column statistics, ClickHouse uses size as the fallback for determining the PREWHERE processing order. + + If you want to go even further under the hood, you can observe each individual PREWHERE processing step by instructing ClickHouse to return all test-level log entries during query execution: + ```sql + SELECT street -FROM - uk.uk_price_paid_simple -WHERE - town = 'LONDON' AND date > '2024-12-31' AND price < 10_000 -SETTINGS send_logs_level = 'test'; -``` - -```txt -... - ... Condition greater(date, '2024-12-31'_String) moved to PREWHERE - ... Condition less(price, 10000_UInt16) moved to PREWHERE - ... Condition equals(town, 'LONDON'_String) moved to PREWHERE -... - ... Executing prewhere actions on block: greater(__table1.date, '2024-12-31'_String) - ... Executing prewhere actions on block: less(__table1.price, 10000_UInt16) -... -``` + FROM + uk.uk_price_paid_simple + WHERE + town = 'LONDON' AND date > '2024-12-31' AND price < 10_000 + SETTINGS send_logs_level = 'test'; + ``` + + ```txt + ... + ... Condition greater(date, '2024-12-31'_String) moved to PREWHERE + ... Condition less(price, 10000_UInt16) moved to PREWHERE + ... Condition equals(town, 'LONDON'_String) moved to PREWHERE + ... + ... Executing prewhere actions on block: greater(__table1.date, '2024-12-31'_String) + ... Executing prewhere actions on block: less(__table1.price, 10000_UInt16) + ... + ``` ## Key takeaways {#key-takeaways} diff --git a/docs/guides/best-practices/query-optimization.md b/docs/guides/best-practices/query-optimization.md index 842cc5bb450..e9cf1192a84 100644 --- a/docs/guides/best-practices/query-optimization.md +++ b/docs/guides/best-practices/query-optimization.md @@ -8,54 +8,53 @@ description: 'A simple guide for query optimization that describe common path to import queryOptimizationDiagram1 from '@site/static/images/guides/best-practices/query_optimization_diagram_1.png'; import Image from '@theme/IdealImage'; - # A simple guide for query optimization This section aims to illustrate through common scenarios how to use different performance and optimization techniques, such as [analyzer](/operations/analyzer), [query profiling](/operations/optimizing-performance/sampling-query-profiler) or [avoid nullable Columns](/optimize/avoid-nullable-columns), in order to improve your ClickHouse query performances. 
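As a quick, hedged illustration of the last of those techniques — not part of the original guide — you can list which columns of a table ended up as `Nullable` (the table name anticipates the dataset introduced below):

```sql
-- Sketch: find Nullable columns in the example table
SELECT name, type
FROM system.columns
WHERE database = 'nyc_taxi'
  AND table = 'trips_small_inferred'
  AND type LIKE 'Nullable%';
```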
## Understand query performance {#understand-query-performance} -The best moment to think about performance optimization is when you're setting up your [data schema](/data-modeling/schema-design) before ingesting data into ClickHouse for the first time.  +The best moment to think about performance optimization is when you're setting up your [data schema](/data-modeling/schema-design) before ingesting data into ClickHouse for the first time. -But let's be honest; it is difficult to predict how much your data will grow or what types of queries will be executed.  +But let's be honest; it is difficult to predict how much your data will grow or what types of queries will be executed. If you have an existing deployment with a few queries that you want to improve, the first step is understanding how those queries perform and why some execute in a few milliseconds while others take longer. -ClickHouse has a rich set of tools to help you understand how your query is getting executed and the resources consumed to perform the execution.  +ClickHouse has a rich set of tools to help you understand how your query is getting executed and the resources consumed to perform the execution. -In this section, we will look at those tools and how to use them.  +In this section, we will look at those tools and how to use them. ## General considerations {#general-considerations} -To understand query performance, let's look at what happens in ClickHouse when a query is executed.  +To understand query performance, let's look at what happens in ClickHouse when a query is executed. -The following part is deliberately simplified and takes some shortcuts; the idea here is not to drown you with details but to get you up to speed with the basic concepts. For more information you can read about [query analyzer](/operations/analyzer).  +The following part is deliberately simplified and takes some shortcuts; the idea here is not to drown you with details but to get you up to speed with the basic concepts. For more information you can read about [query analyzer](/operations/analyzer). -From a very high-level standpoint, when ClickHouse executes a query, the following happens:  +From a very high-level standpoint, when ClickHouse executes a query, the following happens: - - **Query parsing and analysis** +- **Query parsing and analysis** -The query is parsed and analyzed, and a generic query execution plan is created.  +The query is parsed and analyzed, and a generic query execution plan is created. - - **Query optimization** +- **Query optimization** -The query execution plan is optimized, unnecessary data is pruned, and a query pipeline is built from the query plan.  +The query execution plan is optimized, unnecessary data is pruned, and a query pipeline is built from the query plan. - - **Query pipeline execution** +- **Query pipeline execution** -The data is read and processed in parallel. This is the stage where ClickHouse actually executes the query operations such as filtering, aggregations, and sorting.  +The data is read and processed in parallel. This is the stage where ClickHouse actually executes the query operations such as filtering, aggregations, and sorting. - - **Final processing** +- **Final processing** The results are merged, sorted, and formatted into a final result before being sent to the client. 
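Each of these stages can be inspected directly with `EXPLAIN` — a sketch using a placeholder query, not one from this guide:

```sql
-- Parsed AST, optimized logical plan, and physical pipeline, respectively
EXPLAIN AST SELECT count() FROM numbers(10);
EXPLAIN PLAN SELECT count() FROM numbers(10);
EXPLAIN PIPELINE SELECT count() FROM numbers(10);
```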
-In reality, many [optimizations](/concepts/why-clickhouse-is-so-fast) are taking place, and we will discuss them a bit more in this guide, but for now, those main concepts give us a good understanding of what is happening behind the scenes when ClickHouse executes a query.  +In reality, many [optimizations](/concepts/why-clickhouse-is-so-fast) are taking place, and we will discuss them a bit more in this guide, but for now, those main concepts give us a good understanding of what is happening behind the scenes when ClickHouse executes a query. -With this high-level understanding, let's examine the tooling ClickHouse provides and how we can use it to track the metrics that affect query performance.  +With this high-level understanding, let's examine the tooling ClickHouse provides and how we can use it to track the metrics that affect query performance. ## Dataset {#dataset} -We'll use a real example to illustrate how we approach query performances.  +We'll use a real example to illustrate how we approach query performances. Let's use the NYC Taxi dataset, which contains taxi ride data in NYC. First, we start by ingesting the NYC taxi dataset with no optimization. @@ -107,11 +106,11 @@ ORDER BY tuple() ### Query logs {#query-logs} -By default, ClickHouse collects and logs information about each executed query in the [query logs](/operations/system-tables/query_log). This data is stored in the table `system.query_log`.  +By default, ClickHouse collects and logs information about each executed query in the [query logs](/operations/system-tables/query_log). This data is stored in the table `system.query_log`. -For each executed query, ClickHouse logs statistics such as query execution time, number of rows read, and resource usage, such as CPU, memory usage, or filesystem cache hits.  +For each executed query, ClickHouse logs statistics such as query execution time, number of rows read, and resource usage, such as CPU, memory usage, or filesystem cache hits. -Therefore, the query log is a good place to start when investigating slow queries. You can easily spot the queries that take a long time to execute and display the resource usage information for each one.  +Therefore, the query log is a good place to start when investigating slow queries. You can easily spot the queries that take a long time to execute and display the resource usage information for each one. Let's find the top five long-running queries on our NYC taxi dataset. @@ -214,9 +213,9 @@ read_rows: 329044175 tables: ['nyc_taxi.trips_small_inferred'] ``` -The field `query_duration_ms` indicates how long it took for that particular query to execute. Looking at the results from the query logs, we can see that the first query is taking 2967ms to run, which could be improved.  +The field `query_duration_ms` indicates how long it took for that particular query to execute. Looking at the results from the query logs, we can see that the first query is taking 2967ms to run, which could be improved. -You might also want to know which queries are stressing the system by examining the query that consumes the most memory or CPU.  +You might also want to know which queries are stressing the system by examining the query that consumes the most memory or CPU. ```sql -- Top queries by memory usage @@ -236,11 +235,10 @@ ORDER BY memory_usage DESC LIMIT 30 ``` -Let's isolate the long-running queries we found and rerun them a few times to understand the response time.  
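The exact query used in the guide is elided by the hunk above; a minimal sketch of such a lookup against `system.query_log` (the filter and limit are illustrative) could look like:

```sql
-- Sketch: the slowest recently finished queries, with basic resource metrics
SELECT
    event_time,
    query_duration_ms,
    read_rows,
    formatReadableSize(memory_usage) AS memory,
    query
FROM system.query_log
WHERE type = 'QueryFinish'
ORDER BY query_duration_ms DESC
LIMIT 5;
```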
+Let's isolate the long-running queries we found and rerun them a few times to understand the response time. At this point, it is essential to turn off the filesystem cache by setting the `enable_filesystem_cache` setting to 0 to improve reproducibility. - ```sql -- Disable filesystem cache set enable_filesystem_cache = 0; @@ -301,33 +299,33 @@ Summarize in the table for easy reading. | Query 2 | 1.419 sec | 329.04 million | 546.75 MiB | | Query 3 | 1.414 sec | 329.04 million | 451.53 MiB | -Let's understand a bit better what the queries achieve.  +Let's understand a bit better what the queries achieve. -- Query 1 calculates the distance distribution in rides with an average speed of over 30 miles per hour. -- Query 2 finds the number and average cost of rides per week.  -- Query 3 calculates the average time of each trip in the dataset. +- Query 1 calculates the distance distribution in rides with an average speed of over 30 miles per hour. +- Query 2 finds the number and average cost of rides per week. +- Query 3 calculates the average time of each trip in the dataset. -None of these queries are doing very complex processing, except the first query that calculates the trip time on the fly every time the query executes. However, each of these queries takes more than one second to execute, which, in the ClickHouse world, is a very long time. We can also note the memory usage of these queries; more or less 400 Mb for each query is quite a lot of memory. Also, each query appears to read the same number of rows (i.e., 329.04 million). Let's quickly confirm how many rows are in this table. + None of these queries are doing very complex processing, except the first query that calculates the trip time on the fly every time the query executes. However, each of these queries takes more than one second to execute, which, in the ClickHouse world, is a very long time. We can also note the memory usage of these queries; more or less 400 Mb for each query is quite a lot of memory. Also, each query appears to read the same number of rows (i.e., 329.04 million). Let's quickly confirm how many rows are in this table. -```sql --- Count number of rows in table -SELECT count() -FROM nyc_taxi.trips_small_inferred + ```sql + -- Count number of rows in table + SELECT count() + FROM nyc_taxi.trips_small_inferred -Query id: 733372c5-deaf-4719-94e3-261540933b23 + Query id: 733372c5-deaf-4719-94e3-261540933b23 - ┌───count()─┐ -1. │ 329044175 │ -- 329.04 million - └───────────┘ -``` + ┌───count()─┐ + 1. │ 329044175 │ -- 329.04 million + └───────────┘ + ``` -The table contains 329.04 million rows, therefore each query is doing a full scan of the table. + The table contains 329.04 million rows, therefore each query is doing a full scan of the table. ### Explain statement {#explain-statement} Now that we have some long-running queries, let's understand how they are executed. For this, ClickHouse supports the [EXPLAIN statement command](/sql-reference/statements/explain). It is a very useful tool that provides a very detailed view of all the query execution stages without actually running the query. While it can be overwhelming to look at for a non-ClickHouse expert, it remains an essential tool for gaining insight into how your query is executed. -The documentation provides a detailed [guide](/guides/developer/understanding-query-execution-with-the-analyzer) on what the EXPLAIN statement is and how to use it to analyze your query execution. 
Rather than repeating what is in this guide, let's focus on a few commands that will help us find bottlenecks in query execution performance.  +The documentation provides a detailed [guide](/guides/developer/understanding-query-execution-with-the-analyzer) on what the EXPLAIN statement is and how to use it to analyze your query execution. Rather than repeating what is in this guide, let's focus on a few commands that will help us find bottlenecks in query execution performance. **Explain indexes = 1** @@ -352,58 +350,58 @@ Query id: f35c412a-edda-4089-914b-fa1622d69868 3. │ Expression (Before GROUP BY) │ 4. │ Filter (WHERE) │ 5. │ ReadFromMergeTree (nyc_taxi.trips_small_inferred) │ - └─────────────────────────────────────────────────────┘ -``` + └─────────────────────────────────────────────────────┘ + ``` -The output is straightforward. The query begins by reading data from the `nyc_taxi.trips_small_inferred` table. Then, the WHERE clause is applied to filter rows based on computed values. The filtered data is prepared for aggregation, and the quantiles are computed. Finally, the result is sorted and outputted.  + The output is straightforward. The query begins by reading data from the `nyc_taxi.trips_small_inferred` table. Then, the WHERE clause is applied to filter rows based on computed values. The filtered data is prepared for aggregation, and the quantiles are computed. Finally, the result is sorted and outputted. -Here, we can note that no primary keys are used, which makes sense as we didn't define any when we created the table. As a result, ClickHouse is doing a full scan of the table for the query.  + Here, we can note that no primary keys are used, which makes sense as we didn't define any when we created the table. As a result, ClickHouse is doing a full scan of the table for the query. -**Explain Pipeline** + **Explain Pipeline** -EXPLAIN Pipeline shows the concrete execution strategy for the query. There, you can see how ClickHouse actually executed the generic query plan we looked at previously. + EXPLAIN Pipeline shows the concrete execution strategy for the query. There, you can see how ClickHouse actually executed the generic query plan we looked at previously. -```sql -EXPLAIN PIPELINE -WITH + ```sql + EXPLAIN PIPELINE + WITH dateDiff('s', pickup_datetime, dropoff_datetime) AS trip_time, (trip_distance / trip_time) * 3600 AS speed_mph -SELECT quantiles(0.5, 0.75, 0.9, 0.99)(trip_distance) -FROM nyc_taxi.trips_small_inferred -WHERE speed_mph > 30 + SELECT quantiles(0.5, 0.75, 0.9, 0.99)(trip_distance) + FROM nyc_taxi.trips_small_inferred + WHERE speed_mph > 30 -Query id: c7e11e7b-d970-4e35-936c-ecfc24e3b879 + Query id: c7e11e7b-d970-4e35-936c-ecfc24e3b879 ┌─explain─────────────────────────────────────────────────────────────────────────────┐ - 1. │ (Expression) │ - 2. │ ExpressionTransform × 59 │ - 3. │ (Aggregating) │ - 4. │ Resize 59 → 59 │ - 5. │ AggregatingTransform × 59 │ - 6. │ StrictResize 59 → 59 │ - 7. │ (Expression) │ - 8. │ ExpressionTransform × 59 │ - 9. │ (Filter) │ + 1. │ (Expression) │ + 2. │ ExpressionTransform × 59 │ + 3. │ (Aggregating) │ + 4. │ Resize 59 → 59 │ + 5. │ AggregatingTransform × 59 │ + 6. │ StrictResize 59 → 59 │ + 7. │ (Expression) │ + 8. │ ExpressionTransform × 59 │ + 9. │ (Filter) │ 10. │ FilterTransform × 59 │ 11. │ (ReadFromMergeTree) │ 12. 
│ MergeTreeSelect(pool: PrefetchedReadPool, algorithm: Thread) × 59 0 → 1 │ -``` + ``` -Here, we can note the number of threads used to execute the query: 59 threads, which indicates a high parallelization. This speeds up the query, which would take longer to execute on a smaller machine. The number of threads running in parallel can explain the high volume of memory the query uses.  + Here, we can note the number of threads used to execute the query: 59 threads, which indicates a high parallelization. This speeds up the query, which would take longer to execute on a smaller machine. The number of threads running in parallel can explain the high volume of memory the query uses. -Ideally, you would investigate all your slow queries the same way to identify unnecessary complex query plans and understand the number of rows read by each query and the resources consumed. + Ideally, you would investigate all your slow queries the same way to identify unnecessary complex query plans and understand the number of rows read by each query and the resources consumed. ## Methodology {#methodology} -It can be difficult to identify problematic queries on a production deployment, as there are probably a large number of queries being executed at any given time on your ClickHouse deployment.  +It can be difficult to identify problematic queries on a production deployment, as there are probably a large number of queries being executed at any given time on your ClickHouse deployment. -If you know which user, database, or tables are having issues, you can use the fields `user`, `tables`, or `databases` from the `system.query_logs` to narrow down the search.  +If you know which user, database, or tables are having issues, you can use the fields `user`, `tables`, or `databases` from the `system.query_logs` to narrow down the search. -Once you identify the queries you want to optimize, you can start working on them to optimize. One common mistake developers make at this stage is changing multiple things simultaneously, running ad-hoc experiments, and usually ending up with mixed results, but, more importantly, missing a good understanding of what made the query faster.  +Once you identify the queries you want to optimize, you can start working on them to optimize. One common mistake developers make at this stage is changing multiple things simultaneously, running ad-hoc experiments, and usually ending up with mixed results, but, more importantly, missing a good understanding of what made the query faster. -Query optimization requires structure. I'm not talking about advanced benchmarking, but having a simple process in place to understand how your changes affect query performance can go a long way.  +Query optimization requires structure. I'm not talking about advanced benchmarking, but having a simple process in place to understand how your changes affect query performance can go a long way. -Start by identifying your slow queries from query logs, then investigate potential improvements in isolation. When testing the query, make sure you disable the filesystem cache.  +Start by identifying your slow queries from query logs, then investigate potential improvements in isolation. When testing the query, make sure you disable the filesystem cache. > ClickHouse leverages [caching](/operations/caches) to speed up query performance at different stages. This is good for query performance, but during troubleshooting, it could hide potential I/O bottlenecks or poor table schema. 
For this reason, I suggest turning off the filesystem cache during testing. Make sure to have it enabled in production setup. @@ -417,7 +415,7 @@ _Finally, be cautious of outliers; it's pretty common that a query might run slo Now that we have our framework to test, we can start optimizing. -The best place to start is to look at how the data is stored. As for any database, the less data we read, the faster the query will be executed.  +The best place to start is to look at how the data is stored. As for any database, the less data we read, the faster the query will be executed. Depending on how you ingested your data, you might have leveraged ClickHouse [capabilities](/interfaces/schema-inference) to infer the table schema based on the ingested data. While this is very practical to get started, if you want to optimize your query performance, you'll need to review the data schema to best fit your use case. @@ -469,7 +467,7 @@ We have only two columns with null values: `mta_tax` and `payment_type`. The res ### Low cardinality {#low-cardinality} -An easy optimization to apply to Strings is to make best use of the LowCardinality data type. As described in the low cardinality [documentation](/sql-reference/data-types/lowcardinality), ClickHouse applies dictionary coding to LowCardinality-columns, which significantly increases query performance.  +An easy optimization to apply to Strings is to make best use of the LowCardinality data type. As described in the low cardinality [documentation](/sql-reference/data-types/lowcardinality), ClickHouse applies dictionary coding to LowCardinality-columns, which significantly increases query performance. An easy rule of thumb for determining which columns are good candidates for LowCardinality is that any column with less than 10,000 unique values is a perfect candidate. @@ -499,9 +497,9 @@ With a low cardinality, those four columns, `ratecode_id`, `pickup_location_id`, ### Optimize data type {#optimize-data-type} -Clickhouse supports a large number of data types. Make sure to pick the smallest possible data type that fits your use case to optimize performance and reduce your data storage space on disk.  +Clickhouse supports a large number of data types. Make sure to pick the smallest possible data type that fits your use case to optimize performance and reduce your data storage space on disk. -For numbers, you can check the min/max value in your dataset to check if the current precision value matches the reality of your dataset.  +For numbers, you can check the min/max value in your dataset to check if the current precision value matches the reality of your dataset. ```sql -- Find min/max values for the payment_type field @@ -514,10 +512,10 @@ Query id: 4306a8e1-2a9c-4b06-97b4-4d902d2233eb ┌─min(payment_type)─┬─max(payment_type)─┐ 1. │ 1 │ 4 │ - └───────────────────┴───────────────────┘ -``` + └───────────────────┴───────────────────┘ + ``` -For dates, you should pick a precision that matches your dataset and is best suited to answering the queries you're planning to run. + For dates, you should pick a precision that matches your dataset and is best suited to answering the queries you're planning to run. ### Apply the optimizations {#apply-the-optimizations} @@ -549,7 +547,7 @@ ORDER BY tuple(); INSERT INTO trips_small_no_pk SELECT * FROM trips_small_inferred ``` -We run the queries again using the new table to check for improvement.  +We run the queries again using the new table to check for improvement. 
| Name | Run 1 - Elapsed | Elapsed | Rows processed | Peak memory | | ------- | --------------- | --------- | -------------- | ----------- | @@ -557,9 +555,9 @@ We run the queries again using the new table to check for improvement.  | Query 2 | 1.419 sec | 1.171 sec | 329.04 million | 531.09 MiB | | Query 3 | 1.414 sec | 1.188 sec | 329.04 million | 265.05 MiB | -We notice some improvements in both query time and memory usage. Thanks to the optimization in the data schema, we reduce the total volume of data that represents our data, leading to improved memory consumption and reduced processing time.  +We notice some improvements in both query time and memory usage. Thanks to the optimization in the data schema, we reduce the total volume of data that represents our data, leading to improved memory consumption and reduced processing time. -Let's check the size of the tables to see the difference.  +Let's check the size of the tables to see the difference. ```sql SELECT @@ -579,42 +577,42 @@ Query id: 72b5eb1c-ff33-4fdb-9d29-dd076ac6f532 ┌─table────────────────┬─compressed─┬─uncompressed─┬──────rows─┐ 1. │ trips_small_inferred │ 7.38 GiB │ 37.41 GiB │ 329044175 │ 2. │ trips_small_no_pk │ 4.89 GiB │ 15.31 GiB │ 329044175 │ - └──────────────────────┴────────────┴──────────────┴───────────┘ -``` + └──────────────────────┴────────────┴──────────────┴───────────┘ + ``` -The new table is considerably smaller than the previous one. We see a reduction of about 34% in disk space for the table (7.38 GiB vs 4.89 GiB). + The new table is considerably smaller than the previous one. We see a reduction of about 34% in disk space for the table (7.38 GiB vs 4.89 GiB). ## The importance of primary keys {#the-importance-of-primary-keys} -Primary keys in ClickHouse work differently than in most traditional database systems. In those systems, primary keys enforce uniqueness and data integrity. Any attempt to insert duplicate primary key values is rejected, and a B-tree or hash-based index is usually created for fast lookup.  +Primary keys in ClickHouse work differently than in most traditional database systems. In those systems, primary keys enforce uniqueness and data integrity. Any attempt to insert duplicate primary key values is rejected, and a B-tree or hash-based index is usually created for fast lookup. In ClickHouse, the primary key's [objective](/guides/best-practices/sparse-primary-indexes#a-table-with-a-primary-key) is different; it does not enforce uniqueness or help with data integrity. Instead, it is designed to optimize query performance. The primary key defines the order in which the data is stored on disk and is implemented as a sparse index that stores pointers to the first row of each granule. -> Granules in ClickHouse are the smallest units of data read during query execution. They contain up to a fixed number of rows, determined by index_granularity, with a default value of 8192 rows. Granules are stored contiguously and sorted by the primary key.  +> Granules in ClickHouse are the smallest units of data read during query execution. They contain up to a fixed number of rows, determined by index_granularity, with a default value of 8192 rows. Granules are stored contiguously and sorted by the primary key. -Selecting a good set of primary keys is important for performance, and it's actually common to store the same data in different tables and use different sets of primary keys to speed up a specific set of queries.  
+Selecting a good set of primary keys is important for performance, and it's actually common to store the same data in different tables and use different sets of primary keys to speed up a specific set of queries. -Other options supported by ClickHouse, such as Projection or Materialized view, allow you to use a different set of primary keys on the same data. The second part of this blog series will cover this in more detail.  +Other options supported by ClickHouse, such as Projection or Materialized view, allow you to use a different set of primary keys on the same data. The second part of this blog series will cover this in more detail. ### Choose primary keys {#choose-primary-keys} -Choosing the correct set of primary keys is a complex topic, and it might require trade-offs and experiments to find the best combination.  +Choosing the correct set of primary keys is a complex topic, and it might require trade-offs and experiments to find the best combination. -For now, we're going to follow these simple practices:  +For now, we're going to follow these simple practices: -- Use fields that are used to filter in most queries -- Choose columns with lower cardinality first  -- Consider a time-based component in your primary key, as filtering by time on a timestamp dataset is pretty common.  +- Use fields that are used to filter in most queries +- Choose columns with lower cardinality first +- Consider a time-based component in your primary key, as filtering by time on a timestamp dataset is pretty common. -In our case, we will experiment with the following primary keys: `passenger_count`, `pickup_datetime`, and `dropoff_datetime`.  + In our case, we will experiment with the following primary keys: `passenger_count`, `pickup_datetime`, and `dropoff_datetime`. -The cardinality for passenger_count is small (24 unique values) and used in our slow queries. We also add timestamp fields (`pickup_datetime` and `dropoff_datetime`) as they can be filtered often. + The cardinality for passenger_count is small (24 unique values) and used in our slow queries. We also add timestamp fields (`pickup_datetime` and `dropoff_datetime`) as they can be filtered often. -Create a new table with the primary keys and re-ingest the data. + Create a new table with the primary keys and re-ingest the data. -```sql -CREATE TABLE trips_small_pk -( + ```sql + CREATE TABLE trips_small_pk + ( `vendor_id` UInt8, `pickup_datetime` DateTime, `dropoff_datetime` DateTime, @@ -630,17 +628,17 @@ CREATE TABLE trips_small_pk `tip_amount` Decimal32(2), `tolls_amount` Decimal32(2), `total_amount` Decimal32(2) -) -PRIMARY KEY (passenger_count, pickup_datetime, dropoff_datetime); + ) + PRIMARY KEY (passenger_count, pickup_datetime, dropoff_datetime); --- Insert the data -INSERT INTO trips_small_pk SELECT * FROM trips_small_inferred -``` + -- Insert the data + INSERT INTO trips_small_pk SELECT * FROM trips_small_inferred + ``` -We then rerun our queries. We compile the results from the three experiments to see the improvements in elapsed time, rows processed, and memory consumption.  + We then rerun our queries. We compile the results from the three experiments to see the improvements in elapsed time, rows processed, and memory consumption. - - +
Results tables for Query 1, Query 2, and Query 3 across the three runs (HTML tables in the source); the values recoverable from this hunk are:

| Query   | Elapsed (Run 1) | Peak memory (Run 2) | Peak memory (Run 3) |
| ------- | --------------- | ------------------- | ------------------- |
| Query 1 | 1.699 sec       | 337.12 MiB          | 444.19 MiB          |
| Query 2 | 1.419 sec       | 531.09 MiB          | 173.50 MiB          |
| Query 3 | 1.414 sec       | 265.05 MiB          | 197.38 MiB          |
+ + -We can see significant improvement across the board in execution time and memory used.  + We can see significant improvement across the board in execution time and memory used. -Query 2 benefits most from the primary key. Let's have a look at how the query plan generated is different from before. + Query 2 benefits most from the primary key. Let's have a look at how the query plan generated is different from before. -```sql -EXPLAIN indexes = 1 -SELECT + ```sql + EXPLAIN indexes = 1 + SELECT payment_type, COUNT() AS trip_count, formatReadableQuantity(SUM(trip_distance)) AS total_distance, AVG(total_amount) AS total_amount_avg, AVG(tip_amount) AS tip_amount_avg -FROM nyc_taxi.trips_small_pk -WHERE (pickup_datetime >= '2009-01-01') AND (pickup_datetime < '2009-04-01') -GROUP BY payment_type -ORDER BY trip_count DESC + FROM nyc_taxi.trips_small_pk + WHERE (pickup_datetime >= '2009-01-01') AND (pickup_datetime < '2009-04-01') + GROUP BY payment_type + ORDER BY trip_count DESC -Query id: 30116a77-ba86-4e9f-a9a2-a01670ad2e15 + Query id: 30116a77-ba86-4e9f-a9a2-a01670ad2e15 ┌─explain──────────────────────────────────────────────────────────────────────────────────────────────────────────┐ - 1. │ Expression ((Projection + Before ORDER BY [lifted up part])) │ - 2. │ Sorting (Sorting for ORDER BY) │ - 3. │ Expression (Before ORDER BY) │ - 4. │ Aggregating │ - 5. │ Expression (Before GROUP BY) │ - 6. │ Expression │ - 7. │ ReadFromMergeTree (nyc_taxi.trips_small_pk) │ - 8. │ Indexes: │ - 9. │ PrimaryKey │ + 1. │ Expression ((Projection + Before ORDER BY [lifted up part])) │ + 2. │ Sorting (Sorting for ORDER BY) │ + 3. │ Expression (Before ORDER BY) │ + 4. │ Aggregating │ + 5. │ Expression (Before GROUP BY) │ + 6. │ Expression │ + 7. │ ReadFromMergeTree (nyc_taxi.trips_small_pk) │ + 8. │ Indexes: │ + 9. │ PrimaryKey │ 10. │ Keys: │ 11. │ pickup_datetime │ 12. │ Condition: and((pickup_datetime in (-Inf, 1238543999]), (pickup_datetime in [1230768000, +Inf))) │ 13. │ Parts: 9/9 │ 14. │ Granules: 5061/40167 │ └──────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` + ``` -Thanks to the primary key, only a subset of the table granules has been selected. This alone greatly improves the query performance since ClickHouse has to process significantly less data. + Thanks to the primary key, only a subset of the table granules has been selected. This alone greatly improves the query performance since ClickHouse has to process significantly less data. ## Next steps {#next-steps} diff --git a/docs/guides/best-practices/query-parallelism.md b/docs/guides/best-practices/query-parallelism.md index 8347fe1a357..7652bd568f9 100644 --- a/docs/guides/best-practices/query-parallelism.md +++ b/docs/guides/best-practices/query-parallelism.md @@ -18,12 +18,10 @@ import Image from '@theme/IdealImage'; ClickHouse is [built for speed](/concepts/why-clickhouse-is-so-fast). It executes queries in a highly parallel fashion, using all available CPU cores, distributing data across processing lanes, and often pushing hardware close to its limits. - This guide walks through how query parallelism works in ClickHouse and how you can tune or monitor it to improve performance on large workloads. We use an aggregation query on the [uk_price_paid_simple](/parts) dataset to illustrate key concepts. 
- ## Step-by-step: How ClickHouse parallelizes an aggregation query {#step-by-step-how-clickHouse-parallelizes-an-aggregation-query} When ClickHouse ① runs an aggregation query with a filter on the table's primary key, it ② loads the primary index into memory to ③ identify which granules need to be processed, and which can be safely skipped: @@ -37,7 +35,7 @@ The selected data is then [dynamically](#load-balancing-across-processing-lanes) 4 parallel processing lanes

-The number of `n` parallel processing lanes is controlled by the [max_threads](/operations/settings/settings#max_threads) setting, which by default matches the number of CPU cores available to ClickHouse on the server. In the example above, we assume `4` cores. +The number of `n` parallel processing lanes is controlled by the [max_threads](/operations/settings/settings#max_threads) setting, which by default matches the number of CPU cores available to ClickHouse on the server. In the example above, we assume `4` cores. On a machine with `8` cores, query processing throughput would roughly double (but memory usage would also increase accordingly), as more lanes process data in parallel: @@ -84,49 +82,45 @@ SETTINGS send_logs_level='trace'; We can see that - - * ① ClickHouse needs to read 3,609 granules (indicated as marks in the trace logs) across 3 data ranges. * ② With 59 CPU cores, it distributes this work across 59 parallel processing streams—one per lane. -Alternatively, we can use the [EXPLAIN](/sql-reference/statements/explain#explain-pipeline) clause to inspect the [physical operator plan](/academic_overview#4-2-multi-core-parallelization)—also known as the "query pipeline"—for the aggregation query: -```sql runnable=false -EXPLAIN PIPELINE -SELECT - max(price) -FROM - uk.uk_price_paid_simple; -``` - -```txt - ┌─explain───────────────────────────────────────────────────────────────────────────┐ - 1. │ (Expression) │ - 2. │ ExpressionTransform × 59 │ - 3. │ (Aggregating) │ - 4. │ Resize 59 → 59 │ - 5. │ AggregatingTransform × 59 │ - 6. │ StrictResize 59 → 59 │ - 7. │ (Expression) │ - 8. │ ExpressionTransform × 59 │ - 9. │ (ReadFromMergeTree) │ -10. │ MergeTreeSelect(pool: PrefetchedReadPool, algorithm: Thread) × 59 0 → 1 │ - └───────────────────────────────────────────────────────────────────────────────────┘ -``` - -Note: Read the operator plan above from bottom to top. Each line represents a stage in the physical execution plan, starting with reading data from storage at the bottom and ending with the final processing steps at the top. Operators marked with `× 59` are executed concurrently on non-overlapping data regions across 59 parallel processing lanes. This reflects the value of `max_threads` and illustrates how each stage of the query is parallelized across CPU cores. - -ClickHouse's [embedded web UI](/interfaces/http) (available at the `/play` endpoint) can render the physical plan from above as a graphical visualization. In this example, we set `max_threads` to `4` to keep the visualization compact, showing just 4 parallel processing lanes: - -Query pipeline - -Note: Read the visualization from left to right. Each row represents a parallel processing lane that streams data block by block, applying transformations such as filtering, aggregation, and final processing stages. In this example, you can see four parallel lanes corresponding to the `max_threads = 4` setting. - + Alternatively, we can use the [EXPLAIN](/sql-reference/statements/explain#explain-pipeline) clause to inspect the [physical operator plan](/academic_overview#4-2-multi-core-parallelization)—also known as the "query pipeline"—for the aggregation query: + ```sql runnable=false + EXPLAIN PIPELINE + SELECT + max(price) + FROM + uk.uk_price_paid_simple; + ``` + + ```txt + ┌─explain───────────────────────────────────────────────────────────────────────────┐ + 1. │ (Expression) │ + 2. │ ExpressionTransform × 59 │ + 3. │ (Aggregating) │ + 4. │ Resize 59 → 59 │ + 5. │ AggregatingTransform × 59 │ + 6. 
│ StrictResize 59 → 59 │ + 7. │ (Expression) │ + 8. │ ExpressionTransform × 59 │ + 9. │ (ReadFromMergeTree) │ + 10. │ MergeTreeSelect(pool: PrefetchedReadPool, algorithm: Thread) × 59 0 → 1 │ + └───────────────────────────────────────────────────────────────────────────────────┘ + ``` + + Note: Read the operator plan above from bottom to top. Each line represents a stage in the physical execution plan, starting with reading data from storage at the bottom and ending with the final processing steps at the top. Operators marked with `× 59` are executed concurrently on non-overlapping data regions across 59 parallel processing lanes. This reflects the value of `max_threads` and illustrates how each stage of the query is parallelized across CPU cores. + + ClickHouse's [embedded web UI](/interfaces/http) (available at the `/play` endpoint) can render the physical plan from above as a graphical visualization. In this example, we set `max_threads` to `4` to keep the visualization compact, showing just 4 parallel processing lanes: + + Query pipeline + + Note: Read the visualization from left to right. Each row represents a parallel processing lane that streams data block by block, applying transformations such as filtering, aggregation, and final processing stages. In this example, you can see four parallel lanes corresponding to the `max_threads = 4` setting. ### Load balancing across processing lanes {#load-balancing-across-processing-lanes} Note that the `Resize` operators in the physical plan above [repartition and redistribute](/academic_overview#4-2-multi-core-parallelization) data block streams across processing lanes to keep them evenly utilized. This rebalancing is especially important when data ranges vary in how many rows match the query predicates, otherwise, some lanes may become overloaded while others sit idle. By redistributing the work, faster lanes effectively help out slower ones, optimizing overall query runtime. - ## Why max_threads isn't always respected {#why-max-threads-isnt-always-respected} As mentioned above, the number of `n` parallel processing lanes is controlled by the `max_threads` setting, which by default matches the number of CPU cores available to ClickHouse on the server: @@ -137,117 +131,116 @@ SELECT getSetting('max_threads'); ```txt ┌─getSetting('max_threads')─┐ 1. │ 59 │ - └───────────────────────────┘ -``` - -However, the `max_threads` value may be ignored depending on the amount of data selected for processing: -```sql runnable=false -EXPLAIN PIPELINE -SELECT - max(price) -FROM - uk.uk_price_paid_simple -WHERE town = 'LONDON'; -``` - -```txt -... -(ReadFromMergeTree) -MergeTreeSelect(pool: PrefetchedReadPool, algorithm: Thread) × 30 -``` - -As shown in the operator plan extract above, even though `max_threads` is set to `59`, ClickHouse uses only **30** concurrent streams to scan the data. - -Now let's run the query: -```sql runnable=false -SELECT - max(price) -FROM - uk.uk_price_paid_simple -WHERE town = 'LONDON'; -``` - -```txt - ┌─max(price)─┐ + └───────────────────────────┘ + ``` + + However, the `max_threads` value may be ignored depending on the amount of data selected for processing: + ```sql runnable=false + EXPLAIN PIPELINE + SELECT + max(price) + FROM + uk.uk_price_paid_simple + WHERE town = 'LONDON'; + ``` + + ```txt + ... 
+ (ReadFromMergeTree) + MergeTreeSelect(pool: PrefetchedReadPool, algorithm: Thread) × 30 + ``` + + As shown in the operator plan extract above, even though `max_threads` is set to `59`, ClickHouse uses only **30** concurrent streams to scan the data. + + Now let's run the query: + ```sql runnable=false + SELECT + max(price) + FROM + uk.uk_price_paid_simple + WHERE town = 'LONDON'; + ``` + + ```txt + ┌─max(price)─┐ 1. │ 594300000 │ -- 594.30 million - └────────────┘ - -1 row in set. Elapsed: 0.013 sec. Processed 2.31 million rows, 13.66 MB (173.12 million rows/s., 1.02 GB/s.) -Peak memory usage: 27.24 MiB. -``` + └────────────┘ -As shown in the output above, the query processed 2.31 million rows and read 13.66MB of data. This is because, during the index analysis phase, ClickHouse selected **282 granules** for processing, each containing 8,192 rows, totaling approximately 2.31 million rows: + 1 row in set. Elapsed: 0.013 sec. Processed 2.31 million rows, 13.66 MB (173.12 million rows/s., 1.02 GB/s.) + Peak memory usage: 27.24 MiB. + ``` -```sql runnable=false -EXPLAIN indexes = 1 -SELECT - max(price) -FROM - uk.uk_price_paid_simple -WHERE town = 'LONDON'; -``` + As shown in the output above, the query processed 2.31 million rows and read 13.66MB of data. This is because, during the index analysis phase, ClickHouse selected **282 granules** for processing, each containing 8,192 rows, totaling approximately 2.31 million rows: -```txt + ```sql runnable=false + EXPLAIN indexes = 1 + SELECT + max(price) + FROM + uk.uk_price_paid_simple + WHERE town = 'LONDON'; + ``` + + ```txt ┌─explain───────────────────────────────────────────────┐ - 1. │ Expression ((Project names + Projection)) │ - 2. │ Aggregating │ - 3. │ Expression (Before GROUP BY) │ - 4. │ Expression │ - 5. │ ReadFromMergeTree (uk.uk_price_paid_simple) │ - 6. │ Indexes: │ - 7. │ PrimaryKey │ - 8. │ Keys: │ - 9. │ town │ + 1. │ Expression ((Project names + Projection)) │ + 2. │ Aggregating │ + 3. │ Expression (Before GROUP BY) │ + 4. │ Expression │ + 5. │ ReadFromMergeTree (uk.uk_price_paid_simple) │ + 6. │ Indexes: │ + 7. │ PrimaryKey │ + 8. │ Keys: │ + 9. │ town │ 10. │ Condition: (town in ['LONDON', 'LONDON']) │ 11. │ Parts: 3/3 │ 12. │ Granules: 282/3609 │ - └───────────────────────────────────────────────────────┘ -``` + └───────────────────────────────────────────────────────┘ + ``` -Regardless of the configured `max_threads` value, ClickHouse only allocates additional parallel processing lanes when there's enough data to justify them. The "max" in `max_threads` refers to an upper limit, not a guaranteed number of threads used. + Regardless of the configured `max_threads` value, ClickHouse only allocates additional parallel processing lanes when there's enough data to justify them. The "max" in `max_threads` refers to an upper limit, not a guaranteed number of threads used. 
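One way to confirm this after the fact is to compare the configured limit with the number of threads a query actually used. The following sketch assumes a recent ClickHouse version in which `system.query_log` records `thread_ids` and the `Settings` map per query:

```sql runnable=false
-- Sketch: compare the configured max_threads with the number of threads that
-- actually participated in recently finished queries.
-- Note: the Settings map only contains settings that were changed for the query.
SELECT
    query,
    length(thread_ids)      AS threads_used,
    Settings['max_threads'] AS max_threads_setting
FROM system.query_log
WHERE type = 'QueryFinish'
ORDER BY event_time DESC
LIMIT 5;
```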
-What "enough data" means is primarily determined by two settings, which define the minimum number of rows (163,840 by default) and the minimum number of bytes (2,097,152 by default) that each processing lane should handle: + What "enough data" means is primarily determined by two settings, which define the minimum number of rows (163,840 by default) and the minimum number of bytes (2,097,152 by default) that each processing lane should handle: -For shared-nothing clusters: + For shared-nothing clusters: * [merge_tree_min_rows_for_concurrent_read](https://clickhouse.com/docs/operations/settings/settings#merge_tree_min_rows_for_concurrent_read) * [merge_tree_min_bytes_for_concurrent_read](https://clickhouse.com/docs/operations/settings/settings#merge_tree_min_bytes_for_concurrent_read) -For clusters with shared storage (e.g. ClickHouse Cloud): + For clusters with shared storage (e.g. ClickHouse Cloud): * [merge_tree_min_rows_for_concurrent_read_for_remote_filesystem](https://clickhouse.com/docs/operations/settings/settings#merge_tree_min_rows_for_concurrent_read_for_remote_filesystem) * [merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem](https://clickhouse.com/docs/operations/settings/settings#merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem) -Additionally, there's a hard lower limit for read task size, controlled by: + Additionally, there's a hard lower limit for read task size, controlled by: * [Merge_tree_min_read_task_size](https://clickhouse.com/docs/operations/settings/settings#merge_tree_min_read_task_size) + [merge_tree_min_bytes_per_task_for_remote_reading](https://clickhouse.com/docs/operations/settings/settings#merge_tree_min_bytes_per_task_for_remote_reading) -:::warning Don't modify these settings -We don't recommend modifying these settings in production. They're shown here solely to illustrate why `max_threads` doesn't always determine the actual level of parallelism. -::: - - -For demonstration purposes, let's inspect the physical plan with these settings overridden to force maximum concurrency: -```sql runnable=false -EXPLAIN PIPELINE -SELECT - max(price) -FROM - uk.uk_price_paid_simple -WHERE town = 'LONDON' -SETTINGS - max_threads = 59, - merge_tree_min_read_task_size = 0, - merge_tree_min_rows_for_concurrent_read_for_remote_filesystem = 0, - merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem = 0; -``` - -```txt -... -(ReadFromMergeTree) -MergeTreeSelect(pool: PrefetchedReadPool, algorithm: Thread) × 59 -``` - -Now ClickHouse uses 59 concurrent streams to scan the data, fully respecting the configured `max_threads`. - -This demonstrates that for queries on small datasets, ClickHouse will intentionally limit concurrency. Use setting overrides only for testing—not in production—as they can lead to inefficient execution or resource contention. + :::warning Don't modify these settings + We don't recommend modifying these settings in production. They're shown here solely to illustrate why `max_threads` doesn't always determine the actual level of parallelism. + ::: + + For demonstration purposes, let's inspect the physical plan with these settings overridden to force maximum concurrency: + ```sql runnable=false + EXPLAIN PIPELINE + SELECT + max(price) + FROM + uk.uk_price_paid_simple + WHERE town = 'LONDON' + SETTINGS + max_threads = 59, + merge_tree_min_read_task_size = 0, + merge_tree_min_rows_for_concurrent_read_for_remote_filesystem = 0, + merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem = 0; + ``` + + ```txt + ... 
+ (ReadFromMergeTree) + MergeTreeSelect(pool: PrefetchedReadPool, algorithm: Thread) × 59 + ``` + + Now ClickHouse uses 59 concurrent streams to scan the data, fully respecting the configured `max_threads`. + + This demonstrates that for queries on small datasets, ClickHouse will intentionally limit concurrency. Use setting overrides only for testing—not in production—as they can lead to inefficient execution or resource contention. ## Key takeaways {#key-takeaways} @@ -255,15 +248,13 @@ This demonstrates that for queries on small datasets, ClickHouse will intentiona * The actual number of lanes depends on the size of data selected for processing. * Use `EXPLAIN PIPELINE` and trace logs to analyze lane usage. - ## Where to find more information {#where-to-find-more-information} -If you'd like to dive deeper into how ClickHouse executes queries in parallel and how it achieves high performance at scale, explore the following resources: +If you'd like to dive deeper into how ClickHouse executes queries in parallel and how it achieves high performance at scale, explore the following resources: * [Query Processing Layer – VLDB 2024 Paper (Web Edition)](/academic_overview#4-query-processing-layer) - A detailed breakdown of ClickHouse's internal execution model, including scheduling, pipelining, and operator design. * [Partial aggregation states explained](https://clickhouse.com/blog/clickhouse_vs_elasticsearch_mechanics_of_count_aggregations#-multi-core-parallelization) - A technical deep dive into how partial aggregation states enable efficient parallel execution across processing lanes. * A video tutorial walking in detail through all ClickHouse query processing steps: - - + diff --git a/docs/guides/best-practices/skipping-indexes.md b/docs/guides/best-practices/skipping-indexes.md index d2488ab8ab0..b2d46ad2d9e 100644 --- a/docs/guides/best-practices/skipping-indexes.md +++ b/docs/guides/best-practices/skipping-indexes.md @@ -31,88 +31,88 @@ Users can only employ Data Skipping Indexes on the MergeTree family of tables. E - TYPE. The type of index controls the calculation that determines if it is possible to skip reading and evaluating each index block. - GRANULARITY. Each indexed block consists of GRANULARITY granules. For example, if the granularity of the primary table index is 8192 rows, and the index granularity is 4, each indexed "block" will be 32768 rows. -When a user creates a data skipping index, there will be two additional files in each data part directory for the table. + When a user creates a data skipping index, there will be two additional files in each data part directory for the table. - `skp_idx_{index_name}.idx`, which contains the ordered expression values - `skp_idx_{index_name}.mrk2`, which contains the corresponding offsets into the associated data column files. -If some portion of the WHERE clause filtering condition matches the skip index expression when executing a query and reading the relevant column files, ClickHouse will use the index file data to determine whether each relevant block of data must be processed or can be bypassed (assuming that the block has not already been excluded by applying the primary key). To use a very simplified example, consider the following table loaded with predictable data. 
+ If some portion of the WHERE clause filtering condition matches the skip index expression when executing a query and reading the relevant column files, ClickHouse will use the index file data to determine whether each relevant block of data must be processed or can be bypassed (assuming that the block has not already been excluded by applying the primary key). To use a very simplified example, consider the following table loaded with predictable data. -```sql -CREATE TABLE skip_table -( - my_key UInt64, - my_value UInt64 -) -ENGINE MergeTree primary key my_key -SETTINGS index_granularity=8192; - -INSERT INTO skip_table SELECT number, intDiv(number,4096) FROM numbers(100000000); -``` + ```sql + CREATE TABLE skip_table + ( + my_key UInt64, + my_value UInt64 + ) + ENGINE MergeTree primary key my_key + SETTINGS index_granularity=8192; -When executing a simple query that does not use the primary key, all 100 million entries in the `my_value` -column are scanned: + INSERT INTO skip_table SELECT number, intDiv(number,4096) FROM numbers(100000000); + ``` -```sql -SELECT * FROM skip_table WHERE my_value IN (125, 700) + When executing a simple query that does not use the primary key, all 100 million entries in the `my_value` + column are scanned: -┌─my_key─┬─my_value─┐ -│ 512000 │ 125 │ -│ 512001 │ 125 │ -│ ... | ... | -└────────┴──────────┘ + ```sql + SELECT * FROM skip_table WHERE my_value IN (125, 700) -8192 rows in set. Elapsed: 0.079 sec. Processed 100.00 million rows, 800.10 MB (1.26 billion rows/s., 10.10 GB/s. -``` + ┌─my_key─┬─my_value─┐ + │ 512000 │ 125 │ + │ 512001 │ 125 │ + │ ... | ... | + └────────┴──────────┘ -Now add a very basic skip index: + 8192 rows in set. Elapsed: 0.079 sec. Processed 100.00 million rows, 800.10 MB (1.26 billion rows/s., 10.10 GB/s. + ``` -```sql -ALTER TABLE skip_table ADD INDEX vix my_value TYPE set(100) GRANULARITY 2; -``` + Now add a very basic skip index: -Normally skip indexes are only applied on newly inserted data, so just adding the index won't affect the above query. + ```sql + ALTER TABLE skip_table ADD INDEX vix my_value TYPE set(100) GRANULARITY 2; + ``` -To index already existing data, use this statement: + Normally skip indexes are only applied on newly inserted data, so just adding the index won't affect the above query. -```sql -ALTER TABLE skip_table MATERIALIZE INDEX vix; -``` + To index already existing data, use this statement: -Rerun the query with the newly created index: + ```sql + ALTER TABLE skip_table MATERIALIZE INDEX vix; + ``` -```sql -SELECT * FROM skip_table WHERE my_value IN (125, 700) + Rerun the query with the newly created index: -┌─my_key─┬─my_value─┐ -│ 512000 │ 125 │ -│ 512001 │ 125 │ -│ ... | ... | -└────────┴──────────┘ + ```sql + SELECT * FROM skip_table WHERE my_value IN (125, 700) -8192 rows in set. Elapsed: 0.051 sec. Processed 32.77 thousand rows, 360.45 KB (643.75 thousand rows/s., 7.08 MB/s.) -``` + ┌─my_key─┬─my_value─┐ + │ 512000 │ 125 │ + │ 512001 │ 125 │ + │ ... | ... | + └────────┴──────────┘ -Instead of processing 100 million rows of 800 megabytes, ClickHouse has only read and analyzed 32768 rows of 360 kilobytes --- four granules of 8192 rows each. + 8192 rows in set. Elapsed: 0.051 sec. Processed 32.77 thousand rows, 360.45 KB (643.75 thousand rows/s., 7.08 MB/s.) 
+ ``` -In a more visual form, this is how the 4096 rows with a `my_value` of 125 were read and selected, and how the following rows -were skipped without reading from disk: + Instead of processing 100 million rows of 800 megabytes, ClickHouse has only read and analyzed 32768 rows of 360 kilobytes + -- four granules of 8192 rows each. -Simple Skip + In a more visual form, this is how the 4096 rows with a `my_value` of 125 were read and selected, and how the following rows + were skipped without reading from disk: -Users can access detailed information about skip index usage by enabling the trace when executing queries. From -clickhouse-client, set the `send_logs_level`: + Simple Skip -```sql -SET send_logs_level='trace'; -``` -This will provide useful debugging information when trying to tune query SQL and table indexes. From the above -example, the debug log shows that the skip index dropped all but two granules: + Users can access detailed information about skip index usage by enabling the trace when executing queries. From + clickhouse-client, set the `send_logs_level`: -```sql - default.skip_table (933d4b2c-8cea-4bf9-8c93-c56e900eefd1) (SelectExecutor): Index `vix` has dropped 6102/6104 granules. -``` + ```sql + SET send_logs_level='trace'; + ``` + This will provide useful debugging information when trying to tune query SQL and table indexes. From the above + example, the debug log shows that the skip index dropped all but two granules: + + ```sql + default.skip_table (933d4b2c-8cea-4bf9-8c93-c56e900eefd1) (SelectExecutor): Index `vix` has dropped 6102/6104 granules. + ``` ## Skip index types {#skip-index-types} @@ -145,13 +145,13 @@ There are three Data Skipping Index types based on Bloom filters: * The basic **bloom_filter** which takes a single optional parameter of the allowed "false positive" rate between 0 and 1 (if unspecified, .025 is used). * The specialized **tokenbf_v1**. It takes three parameters, all related to tuning the bloom filter used: (1) the size of the filter in bytes (larger filters have fewer false positives, at some cost in storage), (2) number of hash functions applied (again, more hash filters reduce false positives), and (3) the seed for the bloom filter hash functions. See the calculator [here](https://hur.st/bloomfilter/) for more detail on how these parameters affect bloom filter functionality. -This index works only with String, FixedString, and Map datatypes. The input expression is split into character sequences separated by non-alphanumeric characters. For example, a column value of `This is a candidate for a "full text" search` will contain the tokens `This` `is` `a` `candidate` `for` `full` `text` `search`. It is intended for use in LIKE, EQUALS, IN, hasToken() and similar searches for words and other values within longer strings. For example, one possible use might be searching for a small number of class names or line numbers in a column of free form application log lines. + This index works only with String, FixedString, and Map datatypes. The input expression is split into character sequences separated by non-alphanumeric characters. For example, a column value of `This is a candidate for a "full text" search` will contain the tokens `This` `is` `a` `candidate` `for` `full` `text` `search`. It is intended for use in LIKE, EQUALS, IN, hasToken() and similar searches for words and other values within longer strings. 
For example, one possible use might be searching for a small number of class names or line numbers in a column of free form application log lines. * The specialized **ngrambf_v1**. This index functions the same as the token index. It takes one additional parameter before the Bloom filter settings, the size of the ngrams to index. An ngram is a character string of length `n` of any characters, so the string `A short string` with an ngram size of 4 would be indexed as: - ```text - 'A sh', ' sho', 'shor', 'hort', 'ort ', 'rt s', 't st', ' str', 'stri', 'trin', 'ring' - ``` -This index can also be useful for text searches, particularly languages without word breaks, such as Chinese. + ```text + 'A sh', ' sho', 'shor', 'hort', 'ort ', 'rt s', 't st', ' str', 'stri', 'trin', 'ring' + ``` + This index can also be useful for text searches, particularly languages without word breaks, such as Chinese. ## Skip index functions {#skip-index-functions} @@ -159,20 +159,20 @@ The core purpose of data-skipping indexes is to limit the amount of data analyze * data is inserted and the index is defined as a functional expression (with the result of the expression stored in the index files), or * the query is processed and the expression is applied to the stored index values to determine whether to exclude the block. -Each type of skip index works on a subset of available ClickHouse functions appropriate to the index implementation listed -[here](/engines/table-engines/mergetree-family/mergetree/#functions-support). In general, set indexes and Bloom filter based indexes (another type of set index) are both unordered and therefore do not work with ranges. In contrast, minmax indexes work particularly well with ranges since determining whether ranges intersect is very fast. The efficacy of partial match functions LIKE, startsWith, endsWith, and hasToken depend on the index type used, the index expression, and the particular shape of the data. + Each type of skip index works on a subset of available ClickHouse functions appropriate to the index implementation listed + [here](/engines/table-engines/mergetree-family/mergetree/#functions-support). In general, set indexes and Bloom filter based indexes (another type of set index) are both unordered and therefore do not work with ranges. In contrast, minmax indexes work particularly well with ranges since determining whether ranges intersect is very fast. The efficacy of partial match functions LIKE, startsWith, endsWith, and hasToken depend on the index type used, the index expression, and the particular shape of the data. ## Skip index settings {#skip-index-settings} There are two available settings that apply to skip indexes. * **use_skip_indexes** (0 or 1, default 1). Not all queries can efficiently use skip indexes. If a particular filtering condition is -likely to include most granules, applying the data skipping index incurs an unnecessary, and sometimes significant, cost. Set the value to -0 for queries that are unlikely to benefit from any skip indexes. + likely to include most granules, applying the data skipping index incurs an unnecessary, and sometimes significant, cost. Set the value to + 0 for queries that are unlikely to benefit from any skip indexes. * **force_data_skipping_indices** (comma separated list of index names). This setting can be used to prevent some kinds of inefficient -queries. 
In circumstances where querying a table is too expensive unless a skip index is used, using this setting with one or more index -names will return an exception for any query that does not use the listed index. This would prevent poorly written queries from -consuming server resources. + queries. In circumstances where querying a table is too expensive unless a skip index is used, using this setting with one or more index + names will return an exception for any query that does not use the listed index. This would prevent poorly written queries from + consuming server resources. ## Skip index best practices {#skip-best-practices} diff --git a/docs/guides/best-practices/sparse-primary-indexes.md b/docs/guides/best-practices/sparse-primary-indexes.md index dc65f34fc8c..005848645e1 100644 --- a/docs/guides/best-practices/sparse-primary-indexes.md +++ b/docs/guides/best-practices/sparse-primary-indexes.md @@ -42,15 +42,14 @@ In this guide we are going to do a deep dive into ClickHouse indexing. We will i - [how ClickHouse is building and using a table's sparse primary index](#a-table-with-a-primary-key) - [what some of the best practices are for indexing in ClickHouse](#using-multiple-primary-indexes) -You can optionally execute all ClickHouse SQL statements and queries given in this guide by yourself on your own machine. -For installation of ClickHouse and getting started instructions, see the [Quick Start](/get-started/quick-start). + You can optionally execute all ClickHouse SQL statements and queries given in this guide by yourself on your own machine. + For installation of ClickHouse and getting started instructions, see the [Quick Start](/get-started/quick-start). -:::note -This guide is focusing on ClickHouse sparse primary indexes. - -For ClickHouse [secondary data skipping indexes](/engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-data_skipping-indexes), see the [Tutorial](/guides/best-practices/skipping-indexes.md). -::: + :::note + This guide is focusing on ClickHouse sparse primary indexes. + For ClickHouse [secondary data skipping indexes](/engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-data_skipping-indexes), see the [Tutorial](/guides/best-practices/skipping-indexes.md). + ::: ### Data set {#data-set} @@ -60,7 +59,7 @@ Throughout this guide we will use a sample anonymized web traffic data set. - The uncompressed data size is 8.87 million events and about 700 MB. This compresses to 200 mb when stored in ClickHouse. - In our subset, each row contains three columns that indicate an internet user (`UserID` column) who clicked on a URL (`URL` column) at a specific time (`EventTime` column). -With these three columns we can already formulate some typical web analytics queries such as: + With these three columns we can already formulate some typical web analytics queries such as: - "What are the top 10 most clicked urls for a specific user?" - "What are the top 10 users that most frequently clicked a specific URL?" @@ -70,7 +69,6 @@ With these three columns we can already formulate some typical web analytics que All runtime numbers given in this document are based on running ClickHouse 22.2.1 locally on a MacBook Pro with the Apple M1 Pro chip and 16GB of RAM. 
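For instance, the first of those questions maps to a query of the following shape. This is only a sketch to set the scene; `hits` is a placeholder table name, and the guide builds its own example tables (with and without a primary key) in the sections below:

```sql
-- Top 10 most clicked URLs for one specific user ('hits' is a placeholder table).
SELECT URL, count(URL) AS Count
FROM hits
WHERE UserID = 749927693
GROUP BY URL
ORDER BY Count DESC
LIMIT 10;
```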
- ### A full table scan {#a-full-table-scan} In order to see how a query is executed over our data set without a primary key, we create a table (with a MergeTree table engine) by executing the following SQL DDL statement: @@ -86,8 +84,6 @@ ENGINE = MergeTree PRIMARY KEY tuple(); ``` - - Next insert a subset of the hits data set into the table with the following SQL insert statement. This uses the [URL table function](/sql-reference/table-functions/url.md) in order to load a subset of the full dataset hosted remotely at clickhouse.com: @@ -106,10 +102,8 @@ Ok. 0 rows in set. Elapsed: 145.993 sec. Processed 8.87 million rows, 18.40 GB (60.78 thousand rows/s., 126.06 MB/s.) ``` - ClickHouse client's result output shows us that the statement above inserted 8.87 million rows into the table. - Lastly, in order to simplify the discussions later on in this guide and to make the diagrams and results reproducible, we [optimize](/sql-reference/statements/optimize.md) the table using the FINAL keyword: ```sql @@ -121,7 +115,6 @@ In general it is not required nor recommended to immediately optimize a table after loading data into it. Why this is necessary for this example will become apparent. ::: - Now we execute our first web analytics query. The following is calculating the top 10 most clicked urls for the internet user with the UserID 749927693: ```sql @@ -189,45 +182,41 @@ SETTINGS index_granularity = 8192, index_granularity_bytes = 0, compress_primary [//]: # (
)
**DDL Statement Details**

In order to simplify the discussions later on in this guide, as well as make the diagrams and results reproducible, the DDL statement:

- Specifies a compound sorting key for the table via an `ORDER BY` clause.
- Explicitly controls how many index entries the primary index will have through the settings:
  - `index_granularity`: explicitly set to its default value of 8192. This means that for each group of 8192 rows, the primary index will have one index entry. For example, if the table contains 16384 rows, the index will have two index entries.
  - `index_granularity_bytes`: set to 0 in order to disable adaptive index granularity. Adaptive index granularity means that ClickHouse automatically creates one index entry for a group of n rows if either of these are true:
    - If n is less than 8192 and the size of the combined row data for that n rows is larger than or equal to 10 MB (the default value for `index_granularity_bytes`).
    - If the combined row data size for n rows is less than 10 MB but n is 8192.
  - `compress_primary_key`: set to 0 to disable compression of the primary index. This will allow us to optionally inspect its contents later.
- The primary key in the DDL statement above causes the creation of the primary index based on the two specified key columns.
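To double-check which of these settings a table was actually created with, the simplest option is to print its DDL back; the following is a minimal sketch for the example table used throughout this guide:

```sql
-- The output includes the SETTINGS clause (index_granularity, index_granularity_bytes, ...).
SHOW CREATE TABLE hits_UserID_URL;
```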
@@ -246,7 +235,6 @@ The response looks like: 0 rows in set. Elapsed: 149.432 sec. Processed 8.87 million rows, 18.40 GB (59.38 thousand rows/s., 123.16 MB/s.) ``` -
And optimize the table: @@ -284,7 +272,6 @@ primary_key_bytes_in_memory: 96.93 KiB marks: 1083 bytes_on_disk: 207.07 MiB - 1 rows in set. Elapsed: 0.003 sec. ``` @@ -303,7 +290,7 @@ Our table that we created above has - a compound [primary key](/engines/table-engines/mergetree-family/mergetree.md/#primary-keys-and-indexes-in-queries) `(UserID, URL)` and - a compound [sorting key](/engines/table-engines/mergetree-family/mergetree.md/#choosing-a-primary-key-that-differs-from-the-sorting-key) `(UserID, URL, EventTime)`. -:::note + :::note - If we would have specified only the sorting key, then the primary key would be implicitly defined to be equal to the sorting key. - In order to be memory efficient we explicitly specified a primary key that only contains columns that our queries are filtering on. The primary index that is based on the primary key is completely loaded into the main memory. @@ -311,34 +298,31 @@ Our table that we created above has - In order to have consistency in the guide's diagrams and in order to maximise compression ratio we defined a separate sorting key that includes all of our table's columns (if in a column similar data is placed close to each other, for example via sorting, then that data will be compressed better). - The primary key needs to be a prefix of the sorting key if both are specified. -::: + ::: -The inserted rows are stored on disk in lexicographical order (ascending) by the primary key columns (and the additional `EventTime` column from the sorting key). - -:::note -ClickHouse allows inserting multiple rows with identical primary key column values. In this case (see row 1 and row 2 in the diagram below), the final order is determined by the specified sorting key and therefore the value of the `EventTime` column. -::: + The inserted rows are stored on disk in lexicographical order (ascending) by the primary key columns (and the additional `EventTime` column from the sorting key). + :::note + ClickHouse allows inserting multiple rows with identical primary key column values. In this case (see row 1 and row 2 in the diagram below), the final order is determined by the specified sorting key and therefore the value of the `EventTime` column. + ::: -ClickHouse is a column-oriented database management system. As shown in the diagram below + ClickHouse is a column-oriented database management system. As shown in the diagram below - for the on disk representation, there is a single data file (*.bin) per table column where all the values for that column are stored in a compressed format, and - the 8.87 million rows are stored on disk in lexicographic ascending order by the primary key columns (and the additional sort key columns) i.e. in this case - - first by `UserID`, - - then by `URL`, - - and lastly by `EventTime`: + - first by `UserID`, + - then by `URL`, + - and lastly by `EventTime`: -Sparse Primary Indices 01 + Sparse Primary Indices 01 -`UserID.bin`, `URL.bin`, and `EventTime.bin` are the data files on disk where the values of the `UserID`, `URL`, and `EventTime` columns are stored. + `UserID.bin`, `URL.bin`, and `EventTime.bin` are the data files on disk where the values of the `UserID`, `URL`, and `EventTime` columns are stored. -:::note + :::note - As the primary key defines the lexicographical order of the rows on disk, a table can only have one primary key. - We are numbering rows starting with 0 in order to be aligned with the ClickHouse internal row numbering scheme that is also used for logging messages. 
-::: - - + ::: ### Data is organized into granules for parallel data processing {#data-is-organized-into-granules-for-parallel-data-processing} @@ -361,17 +345,16 @@ The first (based on physical order on disk) 8192 rows (their column values) logi - We mentioned in the beginning of this guide in the "DDL Statement Details", that we disabled [adaptive index granularity](/whats-new/changelog/2019.md/#experimental-features-1) (in order to simplify the discussions in this guide, as well as make the diagrams and results reproducible). - Therefore all granules (except the last one) of our example table have the same size. + Therefore all granules (except the last one) of our example table have the same size. - For tables with adaptive index granularity (index granularity is adaptive by [default](/operations/settings/merge-tree-settings#index_granularity_bytes) the size of some granules can be less than 8192 rows depending on the row data sizes. - - We marked some column values from our primary key columns (`UserID`, `URL`) in orange. - These orange-marked column values are the primary key column values of each first row of each granule. - As we will see below, these orange-marked column values will be the entries in the table's primary index. + These orange-marked column values are the primary key column values of each first row of each granule. + As we will see below, these orange-marked column values will be the entries in the table's primary index. - We are numbering granules starting with 0 in order to be aligned with the ClickHouse internal numbering scheme that is also used for logging messages. -::: + ::: ### The primary index has one entry per granule {#the-primary-index-has-one-entry-per-granule} @@ -383,51 +366,39 @@ For example - the first index entry ('mark 0' in the diagram below) is storing the key column values of the first row of granule 0 from the diagram above, - the second index entry ('mark 1' in the diagram below) is storing the key column values of the first row of granule 1 from the diagram above, and so on. -Sparse Primary Indices 03a + Sparse Primary Indices 03a -In total the index has 1083 entries for our table with 8.87 million rows and 1083 granules: + In total the index has 1083 entries for our table with 8.87 million rows and 1083 granules: -Sparse Primary Indices 03b + Sparse Primary Indices 03b -:::note + :::note - For tables with [adaptive index granularity](/whats-new/changelog/2019.md/#experimental-features-1), there is also one "final" additional mark stored in the primary index that records the values of the primary key columns of the last table row, but because we disabled adaptive index granularity (in order to simplify the discussions in this guide, as well as make the diagrams and results reproducible), the index of our example table doesn't include this final mark. - The primary index file is completely loaded into the main memory. If the file is larger than the available free memory space then ClickHouse will raise an error. -::: + :::
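These figures can also be read straight from the system tables. The following sketch relies only on `system.parts` columns that already appear in this guide (marks, rows, and the in-memory size of the primary index per active part):

```sql
SELECT
    table,
    rows,
    marks,                        -- number of primary index entries, one per granule
    primary_key_bytes_in_memory
FROM system.parts
WHERE table = 'hits_UserID_URL' AND active;
```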
**Inspecting the content of the primary index**

On a self-managed ClickHouse cluster we can use the file table function for inspecting the content of the primary index of our example table.

For that we first need to copy the primary index file into the user_files_path of a node from the running cluster:

- Step 1: Get part-path that contains the primary index file

  `SELECT path FROM system.parts WHERE table = 'hits_UserID_URL' AND active = 1`

  returns `/Users/tomschreiber/Clickhouse/store/85f/85f4ee68-6e28-4f08-98b1-7d8affa1d88c/all_1_9_4` on the test machine.

- Step 2: Get user_files_path

  The default user_files_path on Linux is `/var/lib/clickhouse/user_files/` and on Linux you can check if it got changed: `$ grep user_files_path /etc/clickhouse-server/config.xml`

  On the test machine the path is `/Users/tomschreiber/Clickhouse/user_files/`.

- Step 3: Copy the primary index file into the user_files_path

  `cp /Users/tomschreiber/Clickhouse/store/85f/85f4ee68-6e28-4f08-98b1-7d8affa1d88c/all_1_9_4/primary.idx /Users/tomschreiber/Clickhouse/user_files/primary-hits_UserID_URL.idx`

Now we can inspect the content of the primary index via SQL:

- Get amount of entries

  `SELECT count() FROM file('primary-hits_UserID_URL.idx', 'RowBinary', 'UserID UInt32, URL String');`

  returns `1083`

- Get first two index marks

  `SELECT UserID, URL FROM file('primary-hits_UserID_URL.idx', 'RowBinary', 'UserID UInt32, URL String') LIMIT 0, 2;`

  returns

  `240923, http://showtopics.html%3...`
  `4073710, http://mk.ru&pos=3_0`

- Get last index mark

  `SELECT UserID, URL FROM file('primary-hits_UserID_URL.idx', 'RowBinary', 'UserID UInt32, URL String') LIMIT 1082, 1;`

  returns

  `4292714039 │ http://sosyal-mansetleri...`

This matches exactly our diagram of the primary index content for our example table:
- - -The primary key entries are called index marks because each index entry is marking the start of a specific data range. Specifically for the example table: + The primary key entries are called index marks because each index entry is marking the start of a specific data range. Specifically for the example table: - UserID index marks: - The stored `UserID` values in the primary index are sorted in ascending order.
- 'mark 1' in the diagram above thus indicates that the `UserID` values of all table rows in granule 1, and in all following granules, are guaranteed to be greater than or equal to 4.073.710. + The stored `UserID` values in the primary index are sorted in ascending order.
+ 'mark 1' in the diagram above thus indicates that the `UserID` values of all table rows in granule 1, and in all following granules, are guaranteed to be greater than or equal to 4.073.710. - [As we will see later](#the-primary-index-is-used-for-selecting-granules), this global order enables ClickHouse to use a binary search algorithm over the index marks for the first key column when a query is filtering on the first column of the primary key. + [As we will see later](#the-primary-index-is-used-for-selecting-granules), this global order enables ClickHouse to use a binary search algorithm over the index marks for the first key column when a query is filtering on the first column of the primary key. - URL index marks: - The quite similar cardinality of the primary key columns `UserID` and `URL` - means that the index marks for all key columns after the first column in general only indicate a data range as long as the predecessor key column value stays the same for all table rows within at least the current granule.
- For example, because the UserID values of mark 0 and mark 1 are different in the diagram above, ClickHouse can't assume that all URL values of all table rows in granule 0 are larger or equal to `'http://showtopics.html%3...'`. However, if the UserID values of mark 0 and mark 1 would be the same in the diagram above (meaning that the UserID value stays the same for all table rows within the granule 0), the ClickHouse could assume that all URL values of all table rows in granule 0 are larger or equal to `'http://showtopics.html%3...'`. + The quite similar cardinality of the primary key columns `UserID` and `URL` + means that the index marks for all key columns after the first column in general only indicate a data range as long as the predecessor key column value stays the same for all table rows within at least the current granule.
+ For example, because the UserID values of mark 0 and mark 1 are different in the diagram above, ClickHouse can't assume that all URL values of all table rows in granule 0 are larger or equal to `'http://showtopics.html%3...'`. However, if the UserID values of mark 0 and mark 1 would be the same in the diagram above (meaning that the UserID value stays the same for all table rows within the granule 0), the ClickHouse could assume that all URL values of all table rows in granule 0 are larger or equal to `'http://showtopics.html%3...'`. - We will discuss the consequences of this on query execution performance in more detail later. + We will discuss the consequences of this on query execution performance in more detail later. ### The primary index is used for selecting granules {#the-primary-index-is-used-for-selecting-granules} We can now execute our queries with support from the primary index. - The following calculates the top 10 most clicked urls for the UserID 749927693. ```sql @@ -500,7 +461,6 @@ LIMIT 10; The response is: - ```response ┌─URL────────────────────────────┬─Count─┐ │ http://auto.ru/chatay-barana.. │ 170 │ @@ -523,7 +483,6 @@ Processed 8.19 thousand rows, The output for the ClickHouse client is now showing that instead of doing a full table scan, only 8.19 thousand rows were streamed into ClickHouse. - If trace logging is enabled then the ClickHouse server log file shows that ClickHouse was running a binary search over the 1083 UserID index marks, in order to identify granules that possibly can contain rows with a UserID column value of `749927693`. This requires 19 steps with an average time complexity of `O(log2 n)`: ```response ...Executor): Key condition: (column 0 in [749927693, 749927693]) @@ -538,15 +497,13 @@ If + The following diagram and the text below illustrate how for our example query ClickHouse locates granule 176 in the UserID.bin data file. -We discussed earlier in this guide that ClickHouse selected the primary index mark 176 and therefore granule 176 as possibly containing matching rows for our query. + Sparse Primary Indices 06 -ClickHouse now uses the selected mark number (176) from the index for a positional array lookup in the UserID.mrk mark file in order to get the two offsets for locating granule 176. + We discussed earlier in this guide that ClickHouse selected the primary index mark 176 and therefore granule 176 as possibly containing matching rows for our query. -As shown, the first offset is locating the compressed file block within the UserID.bin data file that in turn contains the compressed version of granule 176. + ClickHouse now uses the selected mark number (176) from the index for a positional array lookup in the UserID.mrk mark file in order to get the two offsets for locating granule 176. -Once the located file block is uncompressed into the main memory, the second offset from the mark file can be used to locate granule 176 within the uncompressed data. + As shown, the first offset is locating the compressed file block within the UserID.bin data file that in turn contains the compressed version of granule 176. -ClickHouse needs to locate (and stream all values from) granule 176 from both the UserID.bin data file and the URL.bin data file in order to execute our example query (top 10 most clicked URLs for the internet user with the UserID 749.927.693). + Once the located file block is uncompressed into the main memory, the second offset from the mark file can be used to locate granule 176 within the uncompressed data. 
-The diagram above shows how ClickHouse is locating the granule for the UserID.bin data file. + ClickHouse needs to locate (and stream all values from) granule 176 from both the UserID.bin data file and the URL.bin data file in order to execute our example query (top 10 most clicked URLs for the internet user with the UserID 749.927.693). -In parallel, ClickHouse is doing the same for granule 176 for the URL.bin data file. The two respective granules are aligned and streamed into the ClickHouse engine for further processing i.e. aggregating and counting the URL values per group for all rows where the UserID is 749.927.693, before finally outputting the 10 largest URL groups in descending count order. + The diagram above shows how ClickHouse is locating the granule for the UserID.bin data file. + In parallel, ClickHouse is doing the same for granule 176 for the URL.bin data file. The two respective granules are aligned and streamed into the ClickHouse engine for further processing i.e. aggregating and counting the URL values per group for all rows where the UserID is 749.927.693, before finally outputting the 10 largest URL groups in descending count order. ## Using multiple primary indexes {#using-multiple-primary-indexes} @@ -708,7 +656,6 @@ In parallel, ClickHouse is doing the same for granule 176 for the URL.bin data f ### Secondary key columns can (not) be inefficient {#secondary-key-columns-can-not-be-inefficient} - When a query is filtering on a column that is part of a compound key and is the first key column, [then ClickHouse is running the binary search algorithm over the key column's index marks](#the-primary-index-is-used-for-selecting-granules). But what happens when a query is filtering on a column that is part of a compound key, but is not the first key column? @@ -793,50 +740,46 @@ As an example for both cases we will assume: - the same compound primary key (UserID, URL) for the index. This means rows are first ordered by UserID values. Rows with the same UserID value are then ordered by URL. - a granule size of two i.e. each granule contains two rows. -We have marked the key column values for the first table rows for each granule in orange in the diagrams below.. + We have marked the key column values for the first table rows for each granule in orange in the diagrams below.. -**Predecessor key column has low(er) cardinality** + **Predecessor key column has low(er) cardinality** -Suppose UserID had low cardinality. In this case it would be likely that the same UserID value is spread over multiple table rows and granules and therefore index marks. For index marks with the same UserID, the URL values for the index marks are sorted in ascending order (because the table rows are ordered first by UserID and then by URL). This allows efficient filtering as described below: + Suppose UserID had low cardinality. In this case it would be likely that the same UserID value is spread over multiple table rows and granules and therefore index marks. For index marks with the same UserID, the URL values for the index marks are sorted in ascending order (because the table rows are ordered first by UserID and then by URL). This allows efficient filtering as described below: -Sparse Primary Indices 06 + Sparse Primary Indices 06 -There are three different scenarios for the granule selection process for our abstract sample data in the diagram above: + There are three different scenarios for the granule selection process for our abstract sample data in the diagram above: -1. 
Index mark 0 for which the **URL value is smaller than W3 and for which the URL value of the directly succeeding index mark is also smaller than W3** can be excluded because mark 0, and 1 have the same UserID value. Note that this exclusion-precondition ensures that granule 0 is completely composed of U1 UserID values so that ClickHouse can assume that also the maximum URL value in granule 0 is smaller than W3 and exclude the granule. +1. Index mark 0 for which the **URL value is smaller than W3 and for which the URL value of the directly succeeding index mark is also smaller than W3** can be excluded because mark 0, and 1 have the same UserID value. Note that this exclusion-precondition ensures that granule 0 is completely composed of U1 UserID values so that ClickHouse can assume that also the maximum URL value in granule 0 is smaller than W3 and exclude the granule. 2. Index mark 1 for which the **URL value is smaller (or equal) than W3 and for which the URL value of the directly succeeding index mark is greater (or equal) than W3** is selected because it means that granule 1 can possibly contain rows with URL W3. 3. Index marks 2 and 3 for which the **URL value is greater than W3** can be excluded, since index marks of a primary index store the key column values for the first table row for each granule and the table rows are sorted on disk by the key column values, therefore granule 2 and 3 can't possibly contain URL value W3. -**Predecessor key column has high(er) cardinality** + **Predecessor key column has high(er) cardinality** -When the UserID has high cardinality then it is unlikely that the same UserID value is spread over multiple table rows and granules. This means the URL values for the index marks are not monotonically increasing: + When the UserID has high cardinality then it is unlikely that the same UserID value is spread over multiple table rows and granules. This means the URL values for the index marks are not monotonically increasing: -Sparse Primary Indices 06 + Sparse Primary Indices 06 -As we can see in the diagram above, all shown marks whose URL values are smaller than W3 are getting selected for streaming its associated granule's rows into the ClickHouse engine. + As we can see in the diagram above, all shown marks whose URL values are smaller than W3 are getting selected for streaming its associated granule's rows into the ClickHouse engine. -This is because whilst all index marks in the diagram fall into scenario 1 described above, they do not satisfy the mentioned exclusion-precondition that *the directly succeeding index mark has the same UserID value as the current mark* and thus can't be excluded. + This is because whilst all index marks in the diagram fall into scenario 1 described above, they do not satisfy the mentioned exclusion-precondition that *the directly succeeding index mark has the same UserID value as the current mark* and thus can't be excluded. -For example, consider index mark 0 for which the **URL value is smaller than W3 and for which the URL value of the directly succeeding index mark is also smaller than W3**. This can *not* be excluded because the directly succeeding index mark 1 does *not* have the same UserID value as the current mark 0. + For example, consider index mark 0 for which the **URL value is smaller than W3 and for which the URL value of the directly succeeding index mark is also smaller than W3**. 
This can *not* be excluded because the directly succeeding index mark 1 does *not* have the same UserID value as the current mark 0. -This ultimately prevents ClickHouse from making assumptions about the maximum URL value in granule 0. Instead it has to assume that granule 0 potentially contains rows with URL value W3 and is forced to select mark 0. + This ultimately prevents ClickHouse from making assumptions about the maximum URL value in granule 0. Instead it has to assume that granule 0 potentially contains rows with URL value W3 and is forced to select mark 0. + The same scenario is true for mark 1, 2, and 3. -The same scenario is true for mark 1, 2, and 3. - - -:::note Conclusion -The generic exclusion search algorithm that ClickHouse is using instead of the binary search algorithm when a query is filtering on a column that is part of a compound key, but is not the first key column is most effective when the predecessor key column has low(er) cardinality. -::: - -In our sample data set both key columns (UserID, URL) have similar high cardinality, and, as explained, the generic exclusion search algorithm is not very effective when the predecessor key column of the URL column has a high(er) or similar cardinality. + :::note Conclusion + The generic exclusion search algorithm that ClickHouse is using instead of the binary search algorithm when a query is filtering on a column that is part of a compound key, but is not the first key column is most effective when the predecessor key column has low(er) cardinality. + ::: + In our sample data set both key columns (UserID, URL) have similar high cardinality, and, as explained, the generic exclusion search algorithm is not very effective when the predecessor key column of the URL column has a high(er) or similar cardinality. ### Note about data skipping index {#note-about-data-skipping-index} - Because of the similarly high cardinality of UserID and URL, our [query filtering on URL](/guides/best-practices/sparse-primary-indexes#secondary-key-columns-can-not-be-inefficient) also wouldn't benefit much from creating a [secondary data skipping index](./skipping-indexes.md) on the URL column of our [table with compound primary key (UserID, URL)](#a-table-with-a-primary-key). @@ -855,15 +798,12 @@ The second index entry ('mark 1') is storing the minimum and maximum URL values (ClickHouse also created a special [mark file](#mark-files-are-used-for-locating-granules) for to the data skipping index for [locating](#mark-files-are-used-for-locating-granules) the groups of granules associated with the index marks.) - Because of the similarly high cardinality of UserID and URL, this secondary data skipping index can't help with excluding granules from being selected when our [query filtering on URL](/guides/best-practices/sparse-primary-indexes#secondary-key-columns-can-not-be-inefficient) is executed. The specific URL value that the query is looking for (i.e. 'http://public_search') very likely is between the minimum and maximum value stored by the index for each group of granules resulting in ClickHouse being forced to select the group of granules (because they might contain row(s) matching the query). - ### A need to use multiple primary indexes {#a-need-to-use-multiple-primary-indexes} - As a consequence, if we want to significantly speed up our sample query that filters for rows with a specific URL then we need to use a primary index optimized to that query. 
If in addition we want to keep the good performance of our sample query that filters for rows with a specific UserID then we need to use multiple primary indexes. @@ -874,32 +814,31 @@ The following is showing ways for achieving that. ### Options for creating additional primary indexes {#options-for-creating-additional-primary-indexes} - If we want to significantly speed up both of our sample queries - the one that filters for rows with a specific UserID and the one that filters for rows with a specific URL - then we need to use multiple primary indexes by using one of these three options: - Creating a **second table** with a different primary key. - Creating a **materialized view** on our existing table. - Adding a **projection** to our existing table. -All three options will effectively duplicate our sample data into a additional table in order to reorganize the table primary index and row sort order. + All three options will effectively duplicate our sample data into a additional table in order to reorganize the table primary index and row sort order. -However, the three options differ in how transparent that additional table is to the user with respect to the routing of queries and insert statements. + However, the three options differ in how transparent that additional table is to the user with respect to the routing of queries and insert statements. -When creating a **second table** with a different primary key then queries must be explicitly send to the table version best suited for the query, and new data must be inserted explicitly into both tables in order to keep the tables in sync: + When creating a **second table** with a different primary key then queries must be explicitly send to the table version best suited for the query, and new data must be inserted explicitly into both tables in order to keep the tables in sync: -Sparse Primary Indices 09a + Sparse Primary Indices 09a -With a **materialized view** the additional table is implicitly created and data is automatically kept in sync between both tables: + With a **materialized view** the additional table is implicitly created and data is automatically kept in sync between both tables: -Sparse Primary Indices 09b + Sparse Primary Indices 09b -And the **projection** is the most transparent option because next to automatically keeping the implicitly created (and hidden) additional table in sync with data changes, ClickHouse will automatically choose the most effective table version for queries: + And the **projection** is the most transparent option because next to automatically keeping the implicitly created (and hidden) additional table in sync with data changes, ClickHouse will automatically choose the most effective table version for queries: -Sparse Primary Indices 09c + Sparse Primary Indices 09c -In the following we discuss this three options for creating and using multiple primary indexes in more detail and with real examples. + In the following we discuss this three options for creating and using multiple primary indexes in more detail and with real examples. - + ### Option 1: Secondary Tables {#option-1-secondary-tables} @@ -1003,19 +942,16 @@ The corresponding trace log in the ClickHouse server log file confirms that: ``` ClickHouse selected only 39 index marks, instead of 1076 when generic exclusion search was used. - Note that the additional table is optimized for speeding up the execution of our example query filtering on URLs. 
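For reference, the additional table from this option simply swaps the order of the key columns relative to the original table. A sketch of its shape follows; the column types are assumed from the original table, and the exact DDL in the full guide also repeats the granularity settings used earlier:

```sql
CREATE TABLE hits_URL_UserID
(
    `UserID` UInt32,
    `URL` String,
    `EventTime` DateTime
)
ENGINE = MergeTree
-- URL is now the first key column, so queries filtering on URL can binary-search the index
PRIMARY KEY (URL, UserID)
ORDER BY (URL, UserID, EventTime);

-- Keep both tables in sync by inserting the existing data explicitly
INSERT INTO hits_URL_UserID
SELECT * FROM hits_UserID_URL;
```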
- Similar to the [bad performance](/guides/best-practices/sparse-primary-indexes#secondary-key-columns-can-not-be-inefficient) of that query with our [original table](#a-table-with-a-primary-key), our [example query filtering on `UserIDs`](#the-primary-index-is-used-for-selecting-granules) will not run very effectively with the new additional table, because UserID is now the second key column in the primary index of that table and therefore ClickHouse will use generic exclusion search for granule selection, which is [not very effective for similarly high cardinality](/guides/best-practices/sparse-primary-indexes#generic-exclusion-search-algorithm) of UserID and URL. Open the details box for specifics.
**Query filtering on UserIDs now has bad performance**
```sql SELECT URL, count(URL) AS Count FROM hits_URL_UserID @@ -1024,9 +960,7 @@ GROUP BY URL ORDER BY Count DESC LIMIT 10; ``` - The response is: - ```response ┌─URL────────────────────────────┬─Count─┐ │ http://auto.ru/chatay-barana.. │ 170 │ @@ -1040,31 +974,27 @@ The response is: │ http://auto.ru/chatay-john-D...│ 10 │ │ http://wot/html?page/23600_m...│ 9 │ └────────────────────────────────┴───────┘ - 10 rows in set. Elapsed: 0.024 sec. # highlight-next-line Processed 8.02 million rows, 73.04 MB (340.26 million rows/s., 3.10 GB/s.) ``` - Server Log: ```response ...Executor): Key condition: (column 1 in [749927693, 749927693]) # highlight-next-line ...Executor): Used generic exclusion search over index for part all_1_9_2 - with 1453 steps +with 1453 steps ...Executor): Selected 1/1 parts by partition key, 1 parts by primary key, # highlight-next-line - 980/1083 marks by primary key, 980 marks to read from 23 ranges +980/1083 marks by primary key, 980 marks to read from 23 ranges ...Executor): Reading approx. 8028160 rows with 10 streams ```
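If trace logging is not enabled, a similar picture is available from index analysis alone. The sketch below uses `EXPLAIN indexes = 1`, which prints how many granules stay selected after primary key analysis for this UserID predicate on the URL-first table:

```sql
EXPLAIN indexes = 1
SELECT URL, count(URL) AS Count
FROM hits_URL_UserID
WHERE UserID = 749927693
GROUP BY URL
ORDER BY Count DESC
LIMIT 10;
```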

- We now have two tables. Optimized for speeding up queries filtering on `UserIDs`, and speeding up queries filtering on URLs, respectively: - ### Option 2: Materialized Views {#option-2-materialized-views} Create a [materialized view](/sql-reference/statements/create/view.md) on our existing table. @@ -1094,46 +1024,46 @@ Ok. - if new rows are inserted into the source table hits_UserID_URL, then that rows are automatically also inserted into the implicitly created table - Effectively the implicitly created table has the same row order and primary index as the [secondary table that we created explicitly](/guides/best-practices/sparse-primary-indexes#option-1-secondary-tables): -Sparse Primary Indices 12b1 - -ClickHouse is storing the [column data files](#data-is-stored-on-disk-ordered-by-primary-key-columns) (*.bin), the [mark files](#mark-files-are-used-for-locating-granules) (*.mrk2) and the [primary index](#the-primary-index-has-one-entry-per-granule) (primary.idx) of the implicitly created table in a special folder withing the ClickHouse server's data directory: - -Sparse Primary Indices 12b2 - -::: - -The implicitly created table (and its primary index) backing the materialized view can now be used to significantly speed up the execution of our example query filtering on the URL column: -```sql -SELECT UserID, count(UserID) AS Count --- highlight-next-line -FROM mv_hits_URL_UserID -WHERE URL = 'http://public_search' -GROUP BY UserID -ORDER BY Count DESC -LIMIT 10; -``` - -The response is: - -```response -┌─────UserID─┬─Count─┐ -│ 2459550954 │ 3741 │ -│ 1084649151 │ 2484 │ -│ 723361875 │ 729 │ -│ 3087145896 │ 695 │ -│ 2754931092 │ 672 │ -│ 1509037307 │ 582 │ -│ 3085460200 │ 573 │ -│ 2454360090 │ 556 │ -│ 3884990840 │ 539 │ -│ 765730816 │ 536 │ -└────────────┴───────┘ - -10 rows in set. Elapsed: 0.026 sec. -# highlight-next-line -Processed 335.87 thousand rows, -13.54 MB (12.91 million rows/s., 520.38 MB/s.) -``` + Sparse Primary Indices 12b1 + + ClickHouse is storing the [column data files](#data-is-stored-on-disk-ordered-by-primary-key-columns) (*.bin), the [mark files](#mark-files-are-used-for-locating-granules) (*.mrk2) and the [primary index](#the-primary-index-has-one-entry-per-granule) (primary.idx) of the implicitly created table in a special folder withing the ClickHouse server's data directory: + + Sparse Primary Indices 12b2 + + ::: + + The implicitly created table (and its primary index) backing the materialized view can now be used to significantly speed up the execution of our example query filtering on the URL column: + ```sql + SELECT UserID, count(UserID) AS Count + -- highlight-next-line + FROM mv_hits_URL_UserID + WHERE URL = 'http://public_search' + GROUP BY UserID + ORDER BY Count DESC + LIMIT 10; + ``` + + The response is: + + ```response + ┌─────UserID─┬─Count─┐ + │ 2459550954 │ 3741 │ + │ 1084649151 │ 2484 │ + │ 723361875 │ 729 │ + │ 3087145896 │ 695 │ + │ 2754931092 │ 672 │ + │ 1509037307 │ 582 │ + │ 3085460200 │ 573 │ + │ 2454360090 │ 556 │ + │ 3884990840 │ 539 │ + │ 765730816 │ 536 │ + └────────────┴───────┘ + + 10 rows in set. Elapsed: 0.026 sec. + # highlight-next-line + Processed 335.87 thousand rows, + 13.54 MB (12.91 million rows/s., 520.38 MB/s.) 
+ ``` Because effectively the implicitly created table (and its primary index) backing the materialized view is identical to the [secondary table that we created explicitly](/guides/best-practices/sparse-primary-indexes#option-1-secondary-tables), the query is executed in the same effective way as with the explicitly created table. @@ -1178,53 +1108,51 @@ ALTER TABLE hits_UserID_URL - please note that projections do not make queries that use ORDER BY more efficient, even if the ORDER BY matches the projection's ORDER BY statement (see https://github.com/ClickHouse/ClickHouse/issues/47333) - Effectively the implicitly created hidden table has the same row order and primary index as the [secondary table that we created explicitly](/guides/best-practices/sparse-primary-indexes#option-1-secondary-tables): -Sparse Primary Indices 12c1 - -ClickHouse is storing the [column data files](#data-is-stored-on-disk-ordered-by-primary-key-columns) (*.bin), the [mark files](#mark-files-are-used-for-locating-granules) (*.mrk2) and the [primary index](#the-primary-index-has-one-entry-per-granule) (primary.idx) of the hidden table in a special folder (marked in orange in the screenshot below) next to the source table's data files, mark files, and primary index files: - -Sparse Primary Indices 12c2 - -::: - - -The hidden table (and its primary index) created by the projection can now be (implicitly) used to significantly speed up the execution of our example query filtering on the URL column. Note that the query is syntactically targeting the source table of the projection. -```sql -SELECT UserID, count(UserID) AS Count --- highlight-next-line -FROM hits_UserID_URL -WHERE URL = 'http://public_search' -GROUP BY UserID -ORDER BY Count DESC -LIMIT 10; -``` - -The response is: - -```response -┌─────UserID─┬─Count─┐ -│ 2459550954 │ 3741 │ -│ 1084649151 │ 2484 │ -│ 723361875 │ 729 │ -│ 3087145896 │ 695 │ -│ 2754931092 │ 672 │ -│ 1509037307 │ 582 │ -│ 3085460200 │ 573 │ -│ 2454360090 │ 556 │ -│ 3884990840 │ 539 │ -│ 765730816 │ 536 │ -└────────────┴───────┘ - -10 rows in set. Elapsed: 0.029 sec. -# highlight-next-line -Processed 319.49 thousand rows, 1 -1.38 MB (11.05 million rows/s., 393.58 MB/s.) -``` + Sparse Primary Indices 12c1 + + ClickHouse is storing the [column data files](#data-is-stored-on-disk-ordered-by-primary-key-columns) (*.bin), the [mark files](#mark-files-are-used-for-locating-granules) (*.mrk2) and the [primary index](#the-primary-index-has-one-entry-per-granule) (primary.idx) of the hidden table in a special folder (marked in orange in the screenshot below) next to the source table's data files, mark files, and primary index files: + + Sparse Primary Indices 12c2 + + ::: + + The hidden table (and its primary index) created by the projection can now be (implicitly) used to significantly speed up the execution of our example query filtering on the URL column. Note that the query is syntactically targeting the source table of the projection. + ```sql + SELECT UserID, count(UserID) AS Count + -- highlight-next-line + FROM hits_UserID_URL + WHERE URL = 'http://public_search' + GROUP BY UserID + ORDER BY Count DESC + LIMIT 10; + ``` + + The response is: + + ```response + ┌─────UserID─┬─Count─┐ + │ 2459550954 │ 3741 │ + │ 1084649151 │ 2484 │ + │ 723361875 │ 729 │ + │ 3087145896 │ 695 │ + │ 2754931092 │ 672 │ + │ 1509037307 │ 582 │ + │ 3085460200 │ 573 │ + │ 2454360090 │ 556 │ + │ 3884990840 │ 539 │ + │ 765730816 │ 536 │ + └────────────┴───────┘ + + 10 rows in set. Elapsed: 0.029 sec. 
+ # highlight-next-line + Processed 319.49 thousand rows, 1 + 1.38 MB (11.05 million rows/s., 393.58 MB/s.) + ``` Because effectively the hidden table (and its primary index) created by the projection is identical to the [secondary table that we created explicitly](/guides/best-practices/sparse-primary-indexes#option-1-secondary-tables), the query is executed in the same effective way as with the explicitly created table. The corresponding trace log in the ClickHouse server log file confirms that ClickHouse is running binary search over the index marks: - ```response ...Executor): Key condition: (column 0 in ['http://public_search', 'http://public_search']) @@ -1242,7 +1170,6 @@ The corresponding trace log in the ClickHouse server log file confirms that Clic ### Summary {#summary} - The primary index of our [table with compound primary key (UserID, URL)](#a-table-with-a-primary-key) was very useful for speeding up a [query filtering on UserID](#the-primary-index-is-used-for-selecting-granules). But that index is not providing significant help with speeding up a [query filtering on URL](/guides/best-practices/sparse-primary-indexes#secondary-key-columns-can-not-be-inefficient), despite the URL column being part of the compound primary key. And vice versa: @@ -1252,7 +1179,6 @@ Because of the similarly high cardinality of the primary key columns UserID and Therefore it makes sense to remove the second key column from the primary index (resulting in less memory consumption of the index) and to [use multiple primary indexes](/guides/best-practices/sparse-primary-indexes#using-multiple-primary-indexes) instead. - However if the key columns in a compound primary key have big differences in cardinality, then it is [beneficial for queries](/guides/best-practices/sparse-primary-indexes#generic-exclusion-search-algorithm) to order the primary key columns by cardinality in ascending order. The higher the cardinality difference between the key columns is, the more the order of those columns in the key matters. We will demonstrate that in the next section. @@ -1261,104 +1187,101 @@ The higher the cardinality difference between the key columns is, the more the o - In a compound primary key the order of the key columns can significantly influence both: - the efficiency of the filtering on secondary key columns in queries, and - the compression ratio for the table's data files. -In order to demonstrate that, we will use a version of our [web traffic sample data set](#data-set) -where each row contains three columns that indicate whether or not the access by an internet 'user' (`UserID` column) to a URL (`URL` column) got marked as bot traffic (`IsRobot` column). + In order to demonstrate that, we will use a version of our [web traffic sample data set](#data-set) + where each row contains three columns that indicate whether or not the access by an internet 'user' (`UserID` column) to a URL (`URL` column) got marked as bot traffic (`IsRobot` column). 
-We will use a compound primary key containing all three aforementioned columns that could be used to speed up typical web analytics queries that calculate + We will use a compound primary key containing all three aforementioned columns that could be used to speed up typical web analytics queries that calculate - how much (percentage of) traffic to a specific URL is from bots or - how confident we are that a specific user is (not) a bot (what percentage of traffic from that user is (not) assumed to be bot traffic) -We use this query for calculating the cardinalities of the three columns that we want to use as key columns in a compound primary key (note that we are using the [URL table function](/sql-reference/table-functions/url.md) for querying TSV data ad hoc without having to create a local table). Run this query in `clickhouse client`: -```sql -SELECT + We use this query for calculating the cardinalities of the three columns that we want to use as key columns in a compound primary key (note that we are using the [URL table function](/sql-reference/table-functions/url.md) for querying TSV data ad hoc without having to create a local table). Run this query in `clickhouse client`: + ```sql + SELECT formatReadableQuantity(uniq(URL)) AS cardinality_URL, formatReadableQuantity(uniq(UserID)) AS cardinality_UserID, formatReadableQuantity(uniq(IsRobot)) AS cardinality_IsRobot -FROM -( + FROM + ( SELECT c11::UInt64 AS UserID, c15::String AS URL, c20::UInt8 AS IsRobot FROM url('https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz') WHERE URL != '' -) -``` -The response is: -```response -┌─cardinality_URL─┬─cardinality_UserID─┬─cardinality_IsRobot─┐ -│ 2.39 million │ 119.08 thousand │ 4.00 │ -└─────────────────┴────────────────────┴─────────────────────┘ + ) + ``` + The response is: + ```response + ┌─cardinality_URL─┬─cardinality_UserID─┬─cardinality_IsRobot─┐ + │ 2.39 million │ 119.08 thousand │ 4.00 │ + └─────────────────┴────────────────────┴─────────────────────┘ -1 row in set. Elapsed: 118.334 sec. Processed 8.87 million rows, 15.88 GB (74.99 thousand rows/s., 134.21 MB/s.) -``` + 1 row in set. Elapsed: 118.334 sec. Processed 8.87 million rows, 15.88 GB (74.99 thousand rows/s., 134.21 MB/s.) + ``` -We can see that there is a big difference between the cardinalities, especially between the `URL` and `IsRobot` columns, and therefore the order of these columns in a compound primary key is significant for both the efficient speed up of queries filtering on that columns and for achieving optimal compression ratios for the table's column data files. + We can see that there is a big difference between the cardinalities, especially between the `URL` and `IsRobot` columns, and therefore the order of these columns in a compound primary key is significant for both the efficient speed up of queries filtering on that columns and for achieving optimal compression ratios for the table's column data files. 
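 As a side note, one of the typical web-analytics questions mentioned above (the share of the traffic to a specific URL that comes from bots) could be answered with a query along the following lines. This is only a sketch: it assumes the `hits_IsRobot_UserID_URL` table that is created further below, and the URL literal is just an example value used elsewhere in this guide.

 ```sql
 -- Sketch only: what share of the requests for one URL is marked as bot traffic?
 -- Assumes the hits_IsRobot_UserID_URL table created below; the URL value is illustrative.
 SELECT
     countIf(IsRobot = 1)                          AS bot_requests,
     count()                                       AS total_requests,
     round(100 * bot_requests / total_requests, 2) AS bot_traffic_pct
 FROM hits_IsRobot_UserID_URL
 WHERE URL = 'http://public_search';
 ```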
-In order to demonstrate that we are creating two table versions for our bot traffic analysis data: + In order to demonstrate that we are creating two table versions for our bot traffic analysis data: - a table `hits_URL_UserID_IsRobot` with the compound primary key `(URL, UserID, IsRobot)` where we order the key columns by cardinality in descending order - a table `hits_IsRobot_UserID_URL` with the compound primary key `(IsRobot, UserID, URL)` where we order the key columns by cardinality in ascending order - -Create the table `hits_URL_UserID_IsRobot` with the compound primary key `(URL, UserID, IsRobot)`: -```sql -CREATE TABLE hits_URL_UserID_IsRobot -( + Create the table `hits_URL_UserID_IsRobot` with the compound primary key `(URL, UserID, IsRobot)`: + ```sql + CREATE TABLE hits_URL_UserID_IsRobot + ( `UserID` UInt32, `URL` String, `IsRobot` UInt8 -) -ENGINE = MergeTree --- highlight-next-line -PRIMARY KEY (URL, UserID, IsRobot); -``` - -And populate it with 8.87 million rows: -```sql -INSERT INTO hits_URL_UserID_IsRobot SELECT + ) + ENGINE = MergeTree + -- highlight-next-line + PRIMARY KEY (URL, UserID, IsRobot); + ``` + + And populate it with 8.87 million rows: + ```sql + INSERT INTO hits_URL_UserID_IsRobot SELECT intHash32(c11::UInt64) AS UserID, c15 AS URL, c20 AS IsRobot -FROM url('https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz') -WHERE URL != ''; -``` -This is the response: -```response -0 rows in set. Elapsed: 104.729 sec. Processed 8.87 million rows, 15.88 GB (84.73 thousand rows/s., 151.64 MB/s.) -``` - - -Next, create the table `hits_IsRobot_UserID_URL` with the compound primary key `(IsRobot, UserID, URL)`: -```sql -CREATE TABLE hits_IsRobot_UserID_URL -( + FROM url('https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz') + WHERE URL != ''; + ``` + This is the response: + ```response + 0 rows in set. Elapsed: 104.729 sec. Processed 8.87 million rows, 15.88 GB (84.73 thousand rows/s., 151.64 MB/s.) + ``` + + Next, create the table `hits_IsRobot_UserID_URL` with the compound primary key `(IsRobot, UserID, URL)`: + ```sql + CREATE TABLE hits_IsRobot_UserID_URL + ( `UserID` UInt32, `URL` String, `IsRobot` UInt8 -) -ENGINE = MergeTree --- highlight-next-line -PRIMARY KEY (IsRobot, UserID, URL); -``` -And populate it with the same 8.87 million rows that we used to populate the previous table: - -```sql -INSERT INTO hits_IsRobot_UserID_URL SELECT + ) + ENGINE = MergeTree + -- highlight-next-line + PRIMARY KEY (IsRobot, UserID, URL); + ``` + And populate it with the same 8.87 million rows that we used to populate the previous table: + + ```sql + INSERT INTO hits_IsRobot_UserID_URL SELECT intHash32(c11::UInt64) AS UserID, c15 AS URL, c20 AS IsRobot -FROM url('https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz') -WHERE URL != ''; -``` -The response is: -```response -0 rows in set. Elapsed: 95.959 sec. Processed 8.87 million rows, 15.88 GB (92.48 thousand rows/s., 165.50 MB/s.) -``` + FROM url('https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz') + WHERE URL != ''; + ``` + The response is: + ```response + 0 rows in set. Elapsed: 95.959 sec. Processed 8.87 million rows, 15.88 GB (92.48 thousand rows/s., 165.50 MB/s.) 
+ ``` ### Efficient filtering on secondary key columns {#efficient-filtering-on-secondary-key-columns} @@ -1366,7 +1289,6 @@ When a query is filtering on at least one column that is part of a compound key, When a query is filtering (only) on a column that is part of a compound key, but is not the first key column, [then ClickHouse is using the generic exclusion search algorithm over the key column's index marks](/guides/best-practices/sparse-primary-indexes#secondary-key-columns-can-not-be-inefficient). - For the second case the ordering of the key columns in the compound primary key is significant for the effectiveness of the [generic exclusion search algorithm](https://github.com/ClickHouse/ClickHouse/blob/22.3/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp#L1444). This is a query that is filtering on the `UserID` column of the table where we ordered the key columns `(URL, UserID, IsRobot)` by cardinality in descending order: @@ -1409,7 +1331,6 @@ We can see that the query execution is significantly more effective and faster o The reason for that is that the [generic exclusion search algorithm](https://github.com/ClickHouse/ClickHouse/blob/22.3/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp#L1444) works most effective, when [granules](#the-primary-index-is-used-for-selecting-granules) are selected via a secondary key column where the predecessor key column has a lower cardinality. We illustrated that in detail in a [previous section](#generic-exclusion-search-algorithm) of this guide. - ### Optimal compression ratio of data files {#optimal-compression-ratio-of-data-files} This query compares the compression ratio of the `UserID` column between the two tables that we created above: @@ -1440,31 +1361,30 @@ Although in both tables exactly the same data is stored (we inserted the same 8. - in the table `hits_URL_UserID_IsRobot` with the compound primary key `(URL, UserID, IsRobot)` where we order the key columns by cardinality in descending order, the `UserID.bin` data file takes **11.24 MiB** of disk space - in the table `hits_IsRobot_UserID_URL` with the compound primary key `(IsRobot, UserID, URL)` where we order the key columns by cardinality in ascending order, the `UserID.bin` data file takes only **877.47 KiB** of disk space -Having a good compression ratio for the data of a table's column on disk not only saves space on disk, but also makes queries (especially analytical ones) that require the reading of data from that column faster, as less i/o is required for moving the column's data from disk to the main memory (the operating system's file cache). - -In the following we illustrate why it's beneficial for the compression ratio of a table's columns to order the primary key columns by cardinality in ascending order. + Having a good compression ratio for the data of a table's column on disk not only saves space on disk, but also makes queries (especially analytical ones) that require the reading of data from that column faster, as less i/o is required for moving the column's data from disk to the main memory (the operating system's file cache). -The diagram below sketches the on-disk order of rows for a primary key where the key columns are ordered by cardinality in ascending order: + In the following we illustrate why it's beneficial for the compression ratio of a table's columns to order the primary key columns by cardinality in ascending order. 
-Sparse Primary Indices 14a + The diagram below sketches the on-disk order of rows for a primary key where the key columns are ordered by cardinality in ascending order: -We discussed that [the table's row data is stored on disk ordered by primary key columns](#data-is-stored-on-disk-ordered-by-primary-key-columns). + Sparse Primary Indices 14a -In the diagram above, the table's rows (their column values on disk) are first ordered by their `cl` value, and rows that have the same `cl` value are ordered by their `ch` value. And because the first key column `cl` has low cardinality, it is likely that there are rows with the same `cl` value. And because of that it is also likely that `ch` values are ordered (locally - for rows with the same `cl` value). + We discussed that [the table's row data is stored on disk ordered by primary key columns](#data-is-stored-on-disk-ordered-by-primary-key-columns). -If in a column, similar data is placed close to each other, for example via sorting, then that data will be compressed better. -In general, a compression algorithm benefits from the run length of data (the more data it sees the better for compression) -and locality (the more similar the data is, the better the compression ratio is). + In the diagram above, the table's rows (their column values on disk) are first ordered by their `cl` value, and rows that have the same `cl` value are ordered by their `ch` value. And because the first key column `cl` has low cardinality, it is likely that there are rows with the same `cl` value. And because of that it is also likely that `ch` values are ordered (locally - for rows with the same `cl` value). -In contrast to the diagram above, the diagram below sketches the on-disk order of rows for a primary key where the key columns are ordered by cardinality in descending order: + If in a column, similar data is placed close to each other, for example via sorting, then that data will be compressed better. + In general, a compression algorithm benefits from the run length of data (the more data it sees the better for compression) + and locality (the more similar the data is, the better the compression ratio is). -Sparse Primary Indices 14b + In contrast to the diagram above, the diagram below sketches the on-disk order of rows for a primary key where the key columns are ordered by cardinality in descending order: -Now the table's rows are first ordered by their `ch` value, and rows that have the same `ch` value are ordered by their `cl` value. -But because the first key column `ch` has high cardinality, it is unlikely that there are rows with the same `ch` value. And because of that is is also unlikely that `cl` values are ordered (locally - for rows with the same `ch` value). + Sparse Primary Indices 14b -Therefore the `cl` values are most likely in random order and therefore have a bad locality and compression ration, respectively. + Now the table's rows are first ordered by their `ch` value, and rows that have the same `ch` value are ordered by their `cl` value. + But because the first key column `ch` has high cardinality, it is unlikely that there are rows with the same `ch` value. And because of that is is also unlikely that `cl` values are ordered (locally - for rows with the same `ch` value). + Therefore the `cl` values are most likely in random order and therefore have a bad locality and compression ration, respectively. 
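 As a practical aside, the on-disk sizes quoted above can be inspected through the `system.columns` table. This is a sketch rather than the exact query used earlier in this section; it assumes both tables exist in the current database:

 ```sql
 -- Sketch: compare the on-disk size of the UserID column in the two tables created above
 SELECT
     table,
     formatReadableSize(data_compressed_bytes)   AS compressed_size,
     formatReadableSize(data_uncompressed_bytes) AS uncompressed_size
 FROM system.columns
 WHERE name = 'UserID'
   AND table IN ('hits_URL_UserID_IsRobot', 'hits_IsRobot_UserID_URL');
 ```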
### Summary {#summary-1} @@ -1475,7 +1395,6 @@ For both the efficient filtering on secondary key columns in queries and the com Although in general it is [not](/knowledgebase/key-value) the best use case for ClickHouse, sometimes applications built on top of ClickHouse require to identify single rows of a ClickHouse table. - An intuitive solution for that might be to use a [UUID](https://en.wikipedia.org/wiki/Universally_unique_identifier) column with a unique value per row and for fast retrieval of rows to use that column as a primary key column. For the fastest retrieval, the UUID column [would need to be the first key column](#the-primary-index-is-used-for-selecting-granules). @@ -1496,25 +1415,24 @@ The following diagram shows - the insert order of rows when the content changes (for example because of keystrokes typing the text into the text-area) and - the on-disk order of the data from the inserted rows when the `PRIMARY KEY (hash)` is used: -Sparse Primary Indices 15a + Sparse Primary Indices 15a -Because the `hash` column is used as the primary key column + Because the `hash` column is used as the primary key column - specific rows can be retrieved [very quickly](#the-primary-index-is-used-for-selecting-granules), but - the table's rows (their column data) are stored on disk ordered ascending by (the unique and random) hash values. Therefore also the content column's values are stored in random order with no data locality resulting in a **suboptimal compression ratio for the content column data file**. - -In order to significantly improve the compression ratio for the content column while still achieving fast retrieval of specific rows, pastila.nl is using two hashes (and a compound primary key) for identifying a specific row: + In order to significantly improve the compression ratio for the content column while still achieving fast retrieval of specific rows, pastila.nl is using two hashes (and a compound primary key) for identifying a specific row: - a hash of the content, as discussed above, that is distinct for distinct data, and - a [locality-sensitive hash (fingerprint)](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) that does **not** change on small changes of data. -The following diagram shows + The following diagram shows - the insert order of rows when the content changes (for example because of keystrokes typing the text into the text-area) and - the on-disk order of the data from the inserted rows when the compound `PRIMARY KEY (fingerprint, hash)` is used: -Sparse Primary Indices 15b + Sparse Primary Indices 15b -Now the rows on disk are first ordered by `fingerprint`, and for rows with the same fingerprint value, their `hash` value determines the final order. + Now the rows on disk are first ordered by `fingerprint`, and for rows with the same fingerprint value, their `hash` value determines the final order. -Because data that differs only in small changes is getting the same fingerprint value, similar data is now stored on disk close to each other in the content column. And that is very good for the compression ratio of the content column, as a compression algorithm in general benefits from data locality (the more similar the data is the better the compression ratio is). + Because data that differs only in small changes is getting the same fingerprint value, similar data is now stored on disk close to each other in the content column. 
And that is very good for the compression ratio of the content column, as a compression algorithm in general benefits from data locality (the more similar the data is the better the compression ratio is). -The compromise is that two fields (`fingerprint` and `hash`) are required for the retrieval of a specific row in order to optimally utilise the primary index that results from the compound `PRIMARY KEY (fingerprint, hash)`. + The compromise is that two fields (`fingerprint` and `hash`) are required for the retrieval of a specific row in order to optimally utilise the primary index that results from the compound `PRIMARY KEY (fingerprint, hash)`. diff --git a/docs/guides/creating-tables.md b/docs/guides/creating-tables.md index fe1ecf915df..28f34bcaf25 100644 --- a/docs/guides/creating-tables.md +++ b/docs/guides/creating-tables.md @@ -15,7 +15,7 @@ CREATE DATABASE IF NOT EXISTS helloworld ``` Similarly, use `CREATE TABLE` to define a new table. If you do not specify the database name, the table will be in the -`default` database. +`default` database. The following table named `my_first_table` is created in the `helloworld` database: @@ -38,14 +38,14 @@ In the example above, `my_first_table` is a `MergeTree` table with four columns: - `timestamp`: a `DateTime` value, which represents an instant in time - `metric`: a 32-bit floating point number -:::note -The table engine determines: + :::note + The table engine determines: - How and where the data is stored - Which queries are supported - Whether or not the data is replicated -There are many engines to choose from, but for a simple table on a single-node ClickHouse server, [MergeTree](/engines/table-engines/mergetree-family/mergetree.md) is your likely choice. -::: + There are many engines to choose from, but for a simple table on a single-node ClickHouse server, [MergeTree](/engines/table-engines/mergetree-family/mergetree.md) is your likely choice. + ::: ## A brief intro to primary keys {#a-brief-intro-to-primary-keys} @@ -54,17 +54,17 @@ of primary keys might seem unexpected!): - primary keys in ClickHouse are **_not unique_** for each row in a table -The primary key of a ClickHouse table determines how the data is sorted when written to disk. Every 8,192 rows or 10MB of -data (referred to as the **index granularity**) creates an entry in the primary key index file. This granularity concept -creates a **sparse index** that can easily fit in memory, and the granules represent a stripe of the smallest amount of -column data that gets processed during `SELECT` queries. + The primary key of a ClickHouse table determines how the data is sorted when written to disk. Every 8,192 rows or 10MB of + data (referred to as the **index granularity**) creates an entry in the primary key index file. This granularity concept + creates a **sparse index** that can easily fit in memory, and the granules represent a stripe of the smallest amount of + column data that gets processed during `SELECT` queries. -The primary key can be defined using the `PRIMARY KEY` parameter. If you define a table without a `PRIMARY KEY` specified, -then the key becomes the tuple specified in the `ORDER BY` clause. If you specify both a `PRIMARY KEY` and an `ORDER BY`, the primary key must be a prefix of the sort order. + The primary key can be defined using the `PRIMARY KEY` parameter. If you define a table without a `PRIMARY KEY` specified, + then the key becomes the tuple specified in the `ORDER BY` clause. 
If you specify both a `PRIMARY KEY` and an `ORDER BY`, the primary key must be a prefix of the sort order. -The primary key is also the sorting key, which is a tuple of `(user_id, timestamp)`. Therefore, the data stored in each -column file will be sorted by `user_id`, then `timestamp`. + The primary key is also the sorting key, which is a tuple of `(user_id, timestamp)`. Therefore, the data stored in each + column file will be sorted by `user_id`, then `timestamp`. -:::tip -For more details, check out the [Modeling Data training module](https://learn.clickhouse.com/visitor_catalog_class/show/1328860/?utm_source=clickhouse&utm_medium=docs) in ClickHouse Academy. -::: + :::tip + For more details, check out the [Modeling Data training module](https://learn.clickhouse.com/visitor_catalog_class/show/1328860/?utm_source=clickhouse&utm_medium=docs) in ClickHouse Academy. + ::: diff --git a/docs/guides/developer/alternative-query-languages.md b/docs/guides/developer/alternative-query-languages.md index 9ca05f7ac0d..7b42d43831e 100644 --- a/docs/guides/developer/alternative-query-languages.md +++ b/docs/guides/developer/alternative-query-languages.md @@ -14,7 +14,7 @@ The currently supported dialects are: - `prql`: [Pipelined Relational Query Language (PRQL)](https://prql-lang.org/) - `kusto`: [Kusto Query Language (KQL)](https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query) -Which query language is used is controlled by setting `dialect`. + Which query language is used is controlled by setting `dialect`. ## Standard SQL {#standard-sql} @@ -37,7 +37,6 @@ SET dialect = 'prql' Example PRQL query: - ```prql from trips aggregate { diff --git a/docs/guides/developer/cascading-materialized-views.md b/docs/guides/developer/cascading-materialized-views.md index 31f9aad7426..78f8a933dd2 100644 --- a/docs/guides/developer/cascading-materialized-views.md +++ b/docs/guides/developer/cascading-materialized-views.md @@ -22,13 +22,13 @@ Our Goal 1. We need the data aggregated by month for each domain name, 2. We also need the data aggregated by year for each domain name. -You could choose one of these options: + You could choose one of these options: - Write queries that will read and aggregate the data during the SELECT request - Prepare the data at the ingest time to a new format - Prepare the data at the time of ingest to a specific aggregation. -Preparing the data using Materialized views will allow you to limit the amount of data and calculation ClickHouse needs to do, making your SELECT requests faster. + Preparing the data using Materialized views will allow you to limit the amount of data and calculation ClickHouse needs to do, making your SELECT requests faster. ## Source table for the materialized views {#source-table-for-the-materialized-views} @@ -106,27 +106,27 @@ This step defines the cascade. The `FROM` statement will use the `monthly_aggreg 2. ClickHouse will forward the data received to the first materialized view `monthly_aggregated_data` table, 3. Finally, the data received in step 2 will be forwarded to the `year_aggregated_data`. 
-```sql -CREATE MATERIALIZED VIEW analytics.year_aggregated_data_mv -TO analytics.year_aggregated_data -AS -SELECT + ```sql + CREATE MATERIALIZED VIEW analytics.year_aggregated_data_mv + TO analytics.year_aggregated_data + AS + SELECT toYear(toStartOfYear(month)) AS year, domain_name, sumMerge(sumCountViews) AS sumCountViews -FROM analytics.monthly_aggregated_data -GROUP BY + FROM analytics.monthly_aggregated_data + GROUP BY domain_name, year -``` + ``` -:::note -A common misinterpretation when working with Materialized views is that data is read from the table, This is not how `Materialized views` work; the data forwarded is the inserted block, not the final result in your table. + :::note + A common misinterpretation when working with Materialized views is that data is read from the table, This is not how `Materialized views` work; the data forwarded is the inserted block, not the final result in your table. -Let's imagine in this example that the engine used in `monthly_aggregated_data` is a CollapsingMergeTree, the data forwarded to our second Materialized view `year_aggregated_data_mv` will not be the final result of the collapsed table, it will forward the block of data with the fields defined as in the `SELECT ... GROUP BY`. + Let's imagine in this example that the engine used in `monthly_aggregated_data` is a CollapsingMergeTree, the data forwarded to our second Materialized view `year_aggregated_data_mv` will not be the final result of the collapsed table, it will forward the block of data with the fields defined as in the `SELECT ... GROUP BY`. -If you are using CollapsingMergeTree, ReplacingMergeTree, or even SummingMergeTree and you plan to create a cascade Materialized view you need to understand the limitations described here. -::: + If you are using CollapsingMergeTree, ReplacingMergeTree, or even SummingMergeTree and you plan to create a cascade Materialized view you need to understand the limitations described here. + ::: ## Sample data {#sample-data} @@ -251,7 +251,6 @@ GROUP BY 2 rows in set. Elapsed: 0.004 sec. ``` - ## Combining multiple source tables to single target table {#combining-multiple-source-tables-to-single-target-table} Materialized views can also be used to combine multiple source tables into the same destination table. This is useful for creating a materialized view that is similar to a `UNION ALL` logic. 
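 A minimal sketch of that pattern is shown below. The source tables, destination table, and column names are made up for illustration and are not part of this guide; the key idea is that several materialized views can point `TO` the same destination table:

 ```sql
 -- Hypothetical example: two source tables feeding one destination table
 CREATE TABLE analytics.page_views_raw
 (
     `domain_name` String,
     `event_time` DateTime
 )
 ENGINE = MergeTree
 ORDER BY event_time;

 CREATE TABLE analytics.api_hits_raw
 (
     `domain_name` String,
     `event_time` DateTime
 )
 ENGINE = MergeTree
 ORDER BY event_time;

 CREATE TABLE analytics.daily_traffic
 (
     `day` Date,
     `domain_name` String,
     `source` String,
     `count_events` UInt64
 )
 ENGINE = SummingMergeTree
 ORDER BY (day, domain_name, source);

 -- One materialized view per source table, both writing TO the same target
 CREATE MATERIALIZED VIEW analytics.page_views_mv TO analytics.daily_traffic AS
 SELECT
     toDate(event_time) AS day,
     domain_name,
     'web' AS source,
     count() AS count_events
 FROM analytics.page_views_raw
 GROUP BY day, domain_name;

 CREATE MATERIALIZED VIEW analytics.api_hits_mv TO analytics.daily_traffic AS
 SELECT
     toDate(event_time) AS day,
     domain_name,
     'api' AS source,
     count() AS count_events
 FROM analytics.api_hits_raw
 GROUP BY day, domain_name;
 ```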
diff --git a/docs/guides/developer/debugging-memory-issues.md b/docs/guides/developer/debugging-memory-issues.md index 100bdb45661..44b0191a4af 100644 --- a/docs/guides/developer/debugging-memory-issues.md +++ b/docs/guides/developer/debugging-memory-issues.md @@ -79,4 +79,3 @@ SELECT formatReadableSize(sum(index_granularity_bytes_in_memory_allocated)) AS index_granularity_bytes_in_memory_allocated FROM system.parts; ``` - diff --git a/docs/guides/developer/deduplicating-inserts-on-retries.md b/docs/guides/developer/deduplicating-inserts-on-retries.md index 05a156867d9..98589414756 100644 --- a/docs/guides/developer/deduplicating-inserts-on-retries.md +++ b/docs/guides/developer/deduplicating-inserts-on-retries.md @@ -55,10 +55,10 @@ You can control this process using the following settings for the source table: - [`replicated_deduplication_window_seconds`](/operations/settings/merge-tree-settings#replicated_deduplication_window_seconds) - [`non_replicated_deduplication_window`](/operations/settings/merge-tree-settings#non_replicated_deduplication_window) -You have to also enable the user profile setting [`deduplicate_blocks_in_dependent_materialized_views`](/operations/settings/settings#deduplicate_blocks_in_dependent_materialized_views). -With enabled setting `insert_deduplicate=1` an inserted data is deduplicated in source table. The setting `deduplicate_blocks_in_dependent_materialized_views=1` additionally enables deduplication in dependant tables. You have to enable both if full deduplication is desired. + You have to also enable the user profile setting [`deduplicate_blocks_in_dependent_materialized_views`](/operations/settings/settings#deduplicate_blocks_in_dependent_materialized_views). + With enabled setting `insert_deduplicate=1` an inserted data is deduplicated in source table. The setting `deduplicate_blocks_in_dependent_materialized_views=1` additionally enables deduplication in dependant tables. You have to enable both if full deduplication is desired. -When inserting blocks into tables under materialized views, ClickHouse calculates the `block_id` by hashing a string that combines the `block_id`s from the source table and additional identifiers. This ensures accurate deduplication within materialized views, allowing data to be distinguished based on its original insertion, regardless of any transformations applied before reaching the destination table under the materialized view. + When inserting blocks into tables under materialized views, ClickHouse calculates the `block_id` by hashing a string that combines the `block_id`s from the source table and additional identifiers. This ensures accurate deduplication within materialized views, allowing data to be distinguished based on its original insertion, regardless of any transformations applied before reaching the destination table under the materialized view. 
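 A minimal sketch of enabling both settings described above for the inserting session (they can equally be set in a user profile) could look like this:

 ```sql
 -- Deduplicate retried inserts in the source table ...
 SET insert_deduplicate = 1;
 -- ... and also in the tables that dependent materialized views write to
 SET deduplicate_blocks_in_dependent_materialized_views = 1;
 ```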
## Examples {#examples} @@ -184,7 +184,6 @@ ENGINE = MergeTree ORDER BY tuple() SETTINGS non_replicated_deduplication_window=1000; - SET max_block_size=1; SET min_insert_block_size_rows=0; SET min_insert_block_size_bytes=0; diff --git a/docs/guides/developer/deduplication.md b/docs/guides/developer/deduplication.md index 24ad29c694e..0a89abe7557 100644 --- a/docs/guides/developer/deduplication.md +++ b/docs/guides/developer/deduplication.md @@ -9,7 +9,6 @@ title: 'Deduplication Strategies' import deduplication from '@site/static/images/guides/developer/de_duplication.png'; import Image from '@theme/IdealImage'; - # Deduplication strategies **Deduplication** refers to the process of ***removing duplicate rows of a dataset***. In an OLTP database, this is done easily because each row has a unique primary key-but at the cost of slower inserts. Every inserted row needs to first be searched for and, if found, needs to be replaced. @@ -20,13 +19,13 @@ ClickHouse is built for speed when it comes to data insertion. The storage files - The actual removal of duplicate rows occurs during the merging of parts - Your queries need to allow for the possibility of duplicates -
+
-||| -|------|----| -|Deduplication Logo|ClickHouse provides free training on deduplication and many other topics. The [Deleting and Updating Data training module](https://learn.clickhouse.com/visitor_catalog_class/show/1328954/?utm_source=clickhouse&utm_medium=docs) is a good place to start.| + ||| + |------|----| + |Deduplication Logo|ClickHouse provides free training on deduplication and many other topics. The [Deleting and Updating Data training module](https://learn.clickhouse.com/visitor_catalog_class/show/1328954/?utm_source=clickhouse&utm_medium=docs) is a good place to start.| -
+
## Options for deduplication {#options-for-deduplication} @@ -36,7 +35,7 @@ Deduplication is implemented in ClickHouse using the following table engines: 2. Collapsing rows: the `CollapsingMergeTree` and `VersionedCollapsingMergeTree` table engines use a logic where an existing row is "canceled" and a new row is inserted. They are more complex to implement than `ReplacingMergeTree`, but your queries and aggregations can be simpler to write without worrying about whether or not data has been merged yet. These two table engines are useful when you need to update data frequently. -We walk through both of these techniques below. For more details, check out our free on-demand [Deleting and Updating Data training module](https://learn.clickhouse.com/visitor_catalog_class/show/1328954/?utm_source=clickhouse&utm_medium=docs). + We walk through both of these techniques below. For more details, check out our free on-demand [Deleting and Updating Data training module](https://learn.clickhouse.com/visitor_catalog_class/show/1328954/?utm_source=clickhouse&utm_medium=docs). ## Using ReplacingMergeTree for Upserts {#using-replacingmergetree-for-upserts} @@ -140,7 +139,6 @@ FROM hackernews_rmt └────┴─────────┴─────────────────┴───────┘ ``` - Instead of using `FINAL`, let's use some business logic - we know that the `views` column is always increasing, so we can select the row with the largest value using the `max` function after grouping by the desired columns: ```sql @@ -191,62 +189,62 @@ What is the sign column of a `CollapsingMergeTree` table? It represents the _sta - Rows that cancel each other out are deleted during merges - Rows that do not have a matching pair are kept -Let's add a row to the `hackernews_views` table. Since it is the only row for this primary key, we set its state to 1: + Let's add a row to the `hackernews_views` table. Since it is the only row for this primary key, we set its state to 1: -```sql -INSERT INTO hackernews_views VALUES - (123, 'ricardo', 0, 1) -``` + ```sql + INSERT INTO hackernews_views VALUES + (123, 'ricardo', 0, 1) + ``` -Now suppose we want to change the views column. You insert two rows: one that cancels the existing row, and one that contains the new state of the row: + Now suppose we want to change the views column. 
You insert two rows: one that cancels the existing row, and one that contains the new state of the row: -```sql -INSERT INTO hackernews_views VALUES - (123, 'ricardo', 0, -1), - (123, 'ricardo', 150, 1) -``` + ```sql + INSERT INTO hackernews_views VALUES + (123, 'ricardo', 0, -1), + (123, 'ricardo', 150, 1) + ``` -The table now has 3 rows with the primary key `(123, 'ricardo')`: + The table now has 3 rows with the primary key `(123, 'ricardo')`: -```sql -SELECT * -FROM hackernews_views -``` + ```sql + SELECT * + FROM hackernews_views + ``` -```response -┌──id─┬─author──┬─views─┬─sign─┐ -│ 123 │ ricardo │ 0 │ -1 │ -│ 123 │ ricardo │ 150 │ 1 │ -└─────┴─────────┴───────┴──────┘ -┌──id─┬─author──┬─views─┬─sign─┐ -│ 123 │ ricardo │ 0 │ 1 │ -└─────┴─────────┴───────┴──────┘ -``` + ```response + ┌──id─┬─author──┬─views─┬─sign─┐ + │ 123 │ ricardo │ 0 │ -1 │ + │ 123 │ ricardo │ 150 │ 1 │ + └─────┴─────────┴───────┴──────┘ + ┌──id─┬─author──┬─views─┬─sign─┐ + │ 123 │ ricardo │ 0 │ 1 │ + └─────┴─────────┴───────┴──────┘ + ``` -Notice adding `FINAL` returns the current state row: + Notice adding `FINAL` returns the current state row: -```sql -SELECT * -FROM hackernews_views -FINAL -``` + ```sql + SELECT * + FROM hackernews_views + FINAL + ``` -```response -┌──id─┬─author──┬─views─┬─sign─┐ -│ 123 │ ricardo │ 150 │ 1 │ -└─────┴─────────┴───────┴──────┘ -``` + ```response + ┌──id─┬─author──┬─views─┬─sign─┐ + │ 123 │ ricardo │ 150 │ 1 │ + └─────┴─────────┴───────┴──────┘ + ``` -But of course, using `FINAL` is not recommended for large tables. + But of course, using `FINAL` is not recommended for large tables. -:::note -The value passed in for the `views` column in our example is not really needed, nor does it have to match the current value of `views` of the old row. In fact, you can cancel a row with just the primary key and a -1: + :::note + The value passed in for the `views` column in our example is not really needed, nor does it have to match the current value of `views` of the old row. In fact, you can cancel a row with just the primary key and a -1: -```sql -INSERT INTO hackernews_views(id, author, sign) VALUES - (123, 'ricardo', -1) -``` -::: + ```sql + INSERT INTO hackernews_views(id, author, sign) VALUES + (123, 'ricardo', -1) + ``` + ::: ## Real-time updates from multiple threads {#real-time-updates-from-multiple-threads} @@ -274,69 +272,69 @@ Notice the table uses `VersionsedCollapsingMergeTree` as the engine and passes i - The order that rows were inserted does not matter - Note that if the version column is not a part of the primary key, ClickHouse adds it to the primary key implicitly as the last field -You use the same type of logic when writing queries - group by the primary key and use clever logic to avoid rows that have been canceled but not deleted yet. Let's add some rows to the `hackernews_views_vcmt` table: + You use the same type of logic when writing queries - group by the primary key and use clever logic to avoid rows that have been canceled but not deleted yet. Let's add some rows to the `hackernews_views_vcmt` table: -```sql -INSERT INTO hackernews_views_vcmt VALUES - (1, 'ricardo', 0, 1, 1), - (2, 'ch_fan', 0, 1, 1), - (3, 'kenny', 0, 1, 1) -``` + ```sql + INSERT INTO hackernews_views_vcmt VALUES + (1, 'ricardo', 0, 1, 1), + (2, 'ch_fan', 0, 1, 1), + (3, 'kenny', 0, 1, 1) + ``` -Now we update two of the rows and delete one of them. 
To cancel a row, be sure to include the prior version number (since it is a part of the primary key): + Now we update two of the rows and delete one of them. To cancel a row, be sure to include the prior version number (since it is a part of the primary key): -```sql -INSERT INTO hackernews_views_vcmt VALUES - (1, 'ricardo', 0, -1, 1), - (1, 'ricardo', 50, 1, 2), - (2, 'ch_fan', 0, -1, 1), - (3, 'kenny', 0, -1, 1), - (3, 'kenny', 1000, 1, 2) -``` + ```sql + INSERT INTO hackernews_views_vcmt VALUES + (1, 'ricardo', 0, -1, 1), + (1, 'ricardo', 50, 1, 2), + (2, 'ch_fan', 0, -1, 1), + (3, 'kenny', 0, -1, 1), + (3, 'kenny', 1000, 1, 2) + ``` -We will run the same query as before that cleverly adds and subtracts values based on the sign column: + We will run the same query as before that cleverly adds and subtracts values based on the sign column: -```sql -SELECT + ```sql + SELECT id, author, sum(views * sign) -FROM hackernews_views_vcmt -GROUP BY (id, author) -HAVING sum(sign) > 0 -ORDER BY id ASC -``` - -The result is two rows: - -```response -┌─id─┬─author──┬─sum(multiply(views, sign))─┐ -│ 1 │ ricardo │ 50 │ -│ 3 │ kenny │ 1000 │ -└────┴─────────┴────────────────────────────┘ -``` - -Let's force a table merge: - -```sql -OPTIMIZE TABLE hackernews_views_vcmt -``` - -There should only be two rows in the result: - -```sql -SELECT * -FROM hackernews_views_vcmt -``` - -```response -┌─id─┬─author──┬─views─┬─sign─┬─version─┐ -│ 1 │ ricardo │ 50 │ 1 │ 2 │ -│ 3 │ kenny │ 1000 │ 1 │ 2 │ -└────┴─────────┴───────┴──────┴─────────┘ -``` - -A `VersionedCollapsingMergeTree` table is quite handy when you want to implement deduplication while inserting rows from multiple clients and/or threads. + FROM hackernews_views_vcmt + GROUP BY (id, author) + HAVING sum(sign) > 0 + ORDER BY id ASC + ``` + + The result is two rows: + + ```response + ┌─id─┬─author──┬─sum(multiply(views, sign))─┐ + │ 1 │ ricardo │ 50 │ + │ 3 │ kenny │ 1000 │ + └────┴─────────┴────────────────────────────┘ + ``` + + Let's force a table merge: + + ```sql + OPTIMIZE TABLE hackernews_views_vcmt + ``` + + There should only be two rows in the result: + + ```sql + SELECT * + FROM hackernews_views_vcmt + ``` + + ```response + ┌─id─┬─author──┬─views─┬─sign─┬─version─┐ + │ 1 │ ricardo │ 50 │ 1 │ 2 │ + │ 3 │ kenny │ 1000 │ 1 │ 2 │ + └────┴─────────┴───────┴──────┴─────────┘ + ``` + + A `VersionedCollapsingMergeTree` table is quite handy when you want to implement deduplication while inserting rows from multiple clients and/or threads. ## Why aren't my rows being deduplicated? {#why-arent-my-rows-being-deduplicated} diff --git a/docs/guides/developer/dynamic-column-selection.md b/docs/guides/developer/dynamic-column-selection.md index 277798d6cfa..cf54a2ae61a 100644 --- a/docs/guides/developer/dynamic-column-selection.md +++ b/docs/guides/developer/dynamic-column-selection.md @@ -38,34 +38,34 @@ This query returns the first 10 rows, but only for columns whose names match the 9. │ 5 │ 0 │ 0 │ 5.8 │ 10. │ 5 │ 0 │ 0 │ 5.8 │ └─────────────┴────────────┴──────────────┴──────────────┘ -``` + ``` -Let’s say we also want to return columns that contain the terms `fee` or `tax`. -We can update the regular expression to include those: + Let’s say we also want to return columns that contain the terms `fee` or `tax`. 
+ We can update the regular expression to include those: -```sql -SELECT COLUMNS('.*_amount|fee|tax') -FROM nyc_taxi.trips -ORDER BY rand() -LIMIT 3; -``` + ```sql + SELECT COLUMNS('.*_amount|fee|tax') + FROM nyc_taxi.trips + ORDER BY rand() + LIMIT 3; + ``` -> [Try this query in the SQL playground](https://sql.clickhouse.com?query=U0VMRUNUIENPTFVNTlMoJy4qX2Ftb3VudHxmZWV8dGF4JykKRlJPTSBueWNfdGF4aS50cmlwcwpPUkRFUiBCWSByYW5kKCkgCkxJTUlUIDM7&run_query=true) + > [Try this query in the SQL playground](https://sql.clickhouse.com?query=U0VMRUNUIENPTFVNTlMoJy4qX2Ftb3VudHxmZWV8dGF4JykKRlJPTSBueWNfdGF4aS50cmlwcwpPUkRFUiBCWSByYW5kKCkgCkxJTUlUIDM7&run_query=true) -```text - ┌─fare_amount─┬─mta_tax─┬─tip_amount─┬─tolls_amount─┬─ehail_fee─┬─total_amount─┐ + ```text + ┌─fare_amount─┬─mta_tax─┬─tip_amount─┬─tolls_amount─┬─ehail_fee─┬─total_amount─┐ 1. │ 5 │ 0.5 │ 1 │ 0 │ 0 │ 7.8 │ 2. │ 12.5 │ 0.5 │ 0 │ 0 │ 0 │ 13.8 │ 3. │ 4.5 │ 0.5 │ 1.66 │ 0 │ 0 │ 9.96 │ - └─────────────┴─────────┴────────────┴──────────────┴───────────┴──────────────┘ -``` + └─────────────┴─────────┴────────────┴──────────────┴───────────┴──────────────┘ + ``` ## Selecting multiple patterns {#selecting-multiple-patterns} We can combine multiple column patterns in a single query: ```sql -SELECT +SELECT COLUMNS('.*_amount'), COLUMNS('.*_date.*') FROM nyc_taxi.trips @@ -81,12 +81,12 @@ LIMIT 5; 3. │ 3.5 │ 0 │ 0 │ 4.8 │ 2001-01-01 │ 2001-01-01 00:02:08 │ 2001-01-01 │ 2001-01-01 01:00:02 │ 4. │ 3.5 │ 0 │ 0 │ 4.8 │ 2001-01-01 │ 2001-01-01 00:02:08 │ 2001-01-01 │ 2001-01-01 01:00:02 │ 5. │ 3.5 │ 0 │ 0 │ 4.3 │ 2001-01-01 │ 2001-01-01 00:02:26 │ 2001-01-01 │ 2001-01-01 00:04:49 │ - └─────────────┴────────────┴──────────────┴──────────────┴─────────────┴─────────────────────┴──────────────┴─────────────────────┘ -``` + └─────────────┴────────────┴──────────────┴──────────────┴─────────────┴─────────────────────┴──────────────┴─────────────────────┘ + ``` ## Apply functions to all columns {#applying-functions} -We can also use the [`APPLY`](https://clickhouse.com/docs/sql-reference/statements/select#apply) modifier to apply functions across every column. +We can also use the [`APPLY`](https://clickhouse.com/docs/sql-reference/statements/select#apply) modifier to apply functions across every column. For example, if we wanted to find the maximum value of each of those columns, we could run the following query: ```sql @@ -96,110 +96,101 @@ FROM nyc_taxi.trips; > [Try this query in the SQL playground](https://sql.clickhouse.com?query=U0VMRUNUIENPTFVNTlMoJy4qX2Ftb3VudHxmZWV8dGF4JykgQVBQTFkobWF4KQpGUk9NIG55Y190YXhpLnRyaXBzOw&run_query=true) - ```text ┌─max(fare_amount)─┬─max(mta_tax)─┬─max(tip_amount)─┬─max(tolls_amount)─┬─max(ehail_fee)─┬─max(total_amount)─┐ 1. 
│ 998310 │ 500000.5 │ 3950588.8 │ 7999.92 │ 1.95 │ 3950611.5 │ - └──────────────────┴──────────────┴─────────────────┴───────────────────┴────────────────┴───────────────────┘ -``` + └──────────────────┴──────────────┴─────────────────┴───────────────────┴────────────────┴───────────────────┘ + ``` -Or maybe, we’d like to see the average instead: + Or maybe, we’d like to see the average instead: -```sql -SELECT COLUMNS('.*_amount|fee|tax') APPLY(avg) -FROM nyc_taxi.trips -``` + ```sql + SELECT COLUMNS('.*_amount|fee|tax') APPLY(avg) + FROM nyc_taxi.trips + ``` -> [Try this query in the SQL playground](https://sql.clickhouse.com?query=U0VMRUNUIENPTFVNTlMoJy4qX2Ftb3VudHxmZWV8dGF4JykgQVBQTFkoYXZnKQpGUk9NIG55Y190YXhpLnRyaXBzOw&run_query=true) + > [Try this query in the SQL playground](https://sql.clickhouse.com?query=U0VMRUNUIENPTFVNTlMoJy4qX2Ftb3VudHxmZWV8dGF4JykgQVBQTFkoYXZnKQpGUk9NIG55Y190YXhpLnRyaXBzOw&run_query=true) - -```text - ┌─avg(fare_amount)─┬───────avg(mta_tax)─┬────avg(tip_amount)─┬──avg(tolls_amount)─┬──────avg(ehail_fee)─┬──avg(total_amount)─┐ + ```text + ┌─avg(fare_amount)─┬───────avg(mta_tax)─┬────avg(tip_amount)─┬──avg(tolls_amount)─┬──────avg(ehail_fee)─┬──avg(total_amount)─┐ 1. │ 11.8044154834777 │ 0.4555942672733423 │ 1.3469850969211845 │ 0.2256511991414463 │ 3.37600560437412e-9 │ 14.423323722271563 │ - └──────────────────┴────────────────────┴────────────────────┴────────────────────┴─────────────────────┴────────────────────┘ -``` + └──────────────────┴────────────────────┴────────────────────┴────────────────────┴─────────────────────┴────────────────────┘ + ``` + Those values contain a lot of decimal places, but luckily we can fix that by chaining functions. In this case, we’ll apply the avg function, followed by the round function: -Those values contain a lot of decimal places, but luckily we can fix that by chaining functions. In this case, we’ll apply the avg function, followed by the round function: + ```sql + SELECT COLUMNS('.*_amount|fee|tax') APPLY(avg) APPLY(round) + FROM nyc_taxi.trips; + ``` -```sql -SELECT COLUMNS('.*_amount|fee|tax') APPLY(avg) APPLY(round) -FROM nyc_taxi.trips; -``` + > [Try this query in the SQL playground](https://sql.clickhouse.com?query=U0VMRUNUIENPTFVNTlMoJy4qX2Ftb3VudHxmZWV8dGF4JykgQVBQTFkoYXZnKSBBUFBMWShyb3VuZCkKRlJPTSBueWNfdGF4aS50cmlwczs&run_query=true) -> [Try this query in the SQL playground](https://sql.clickhouse.com?query=U0VMRUNUIENPTFVNTlMoJy4qX2Ftb3VudHxmZWV8dGF4JykgQVBQTFkoYXZnKSBBUFBMWShyb3VuZCkKRlJPTSBueWNfdGF4aS50cmlwczs&run_query=true) - - -```text - ┌─round(avg(fare_amount))─┬─round(avg(mta_tax))─┬─round(avg(tip_amount))─┬─round(avg(tolls_amount))─┬─round(avg(ehail_fee))─┬─round(avg(total_amount))─┐ + ```text + ┌─round(avg(fare_amount))─┬─round(avg(mta_tax))─┬─round(avg(tip_amount))─┬─round(avg(tolls_amount))─┬─round(avg(ehail_fee))─┬─round(avg(total_amount))─┐ 1. │ 12 │ 0 │ 1 │ 0 │ 0 │ 14 │ - └─────────────────────────┴─────────────────────┴────────────────────────┴──────────────────────────┴───────────────────────┴──────────────────────────┘ -``` - + └─────────────────────────┴─────────────────────┴────────────────────────┴──────────────────────────┴───────────────────────┴──────────────────────────┘ + ``` -But that rounds the averages to whole numbers. If we want to round to, say, 2 decimal places, we can do that as well. 
As well as taking in functions, the `APPLY` modifier accepts a lambda, which gives us the flexibility to have the round function round our average values to 2 decimal places: - -```sql -SELECT COLUMNS('.*_amount|fee|tax') APPLY(avg) APPLY(x -> round(x, 2)) -FROM nyc_taxi.trips; -``` + But that rounds the averages to whole numbers. If we want to round to, say, 2 decimal places, we can do that as well. As well as taking in functions, the `APPLY` modifier accepts a lambda, which gives us the flexibility to have the round function round our average values to 2 decimal places: -> [Try this query in the SQL playground](https://sql.clickhouse.com?query=U0VMRUNUIENPTFVNTlMoJy4qX2Ftb3VudHxmZWV8dGF4JykgQVBQTFkgYXZnIEFQUExZIHggLT4gcm91bmQoeCwgMikKRlJPTSBueWNfdGF4aS50cmlwcw&run_query=true) + ```sql + SELECT COLUMNS('.*_amount|fee|tax') APPLY(avg) APPLY(x -> round(x, 2)) + FROM nyc_taxi.trips; + ``` + > [Try this query in the SQL playground](https://sql.clickhouse.com?query=U0VMRUNUIENPTFVNTlMoJy4qX2Ftb3VudHxmZWV8dGF4JykgQVBQTFkgYXZnIEFQUExZIHggLT4gcm91bmQoeCwgMikKRlJPTSBueWNfdGF4aS50cmlwcw&run_query=true) -```text - ┌─round(avg(fare_amount), 2)─┬─round(avg(mta_tax), 2)─┬─round(avg(tip_amount), 2)─┬─round(avg(tolls_amount), 2)─┬─round(avg(ehail_fee), 2)─┬─round(avg(total_amount), 2)─┐ + ```text + ┌─round(avg(fare_amount), 2)─┬─round(avg(mta_tax), 2)─┬─round(avg(tip_amount), 2)─┬─round(avg(tolls_amount), 2)─┬─round(avg(ehail_fee), 2)─┬─round(avg(total_amount), 2)─┐ 1. │ 11.8 │ 0.46 │ 1.35 │ 0.23 │ 0 │ 14.42 │ - └────────────────────────────┴────────────────────────┴───────────────────────────┴─────────────────────────────┴──────────────────────────┴─────────────────────────────┘ -``` + └────────────────────────────┴────────────────────────┴───────────────────────────┴─────────────────────────────┴──────────────────────────┴─────────────────────────────┘ + ``` ## Replacing columns {#replacing-columns} So far so good. But let’s say we want to adjust one of the values, while leaving the other ones as they are. For example, maybe we want to double the total amount and divide the MTA tax by 1.1. We can do that by using the [`REPLACE`](/sql-reference/statements/select#replace) modifier, which will replace a column while leaving the other ones as they are. ```sql -FROM nyc_taxi.trips -SELECT +FROM nyc_taxi.trips +SELECT COLUMNS('.*_amount|fee|tax') REPLACE( total_amount*2 AS total_amount, mta_tax/1.1 AS mta_tax - ) + ) APPLY(avg) APPLY(col -> round(col, 2)); ``` > [Try this query in the SQL playground](https://sql.clickhouse.com?query=RlJPTSBueWNfdGF4aS50cmlwcyAKU0VMRUNUIAogIENPTFVNTlMoJy4qX2Ftb3VudHxmZWV8dGF4JykKICBSRVBMQUNFKAogICAgdG90YWxfYW1vdW50KjIgQVMgdG90YWxfYW1vdW50LAogICAgbXRhX3RheC8xLjEgQVMgbXRhX3RheAogICkgCiAgQVBQTFkoYXZnKQogIEFQUExZKGNvbCAtPiByb3VuZChjb2wsIDIpKTs&run_query=true) - ```text ┌─round(avg(fare_amount), 2)─┬─round(avg(di⋯, 1.1)), 2)─┬─round(avg(tip_amount), 2)─┬─round(avg(tolls_amount), 2)─┬─round(avg(ehail_fee), 2)─┬─round(avg(mu⋯nt, 2)), 2)─┐ 1. │ 11.8 │ 0.41 │ 1.35 │ 0.23 │ 0 │ 28.85 │ - └────────────────────────────┴──────────────────────────┴───────────────────────────┴─────────────────────────────┴──────────────────────────┴──────────────────────────┘ -``` + └────────────────────────────┴──────────────────────────┴───────────────────────────┴─────────────────────────────┴──────────────────────────┴──────────────────────────┘ + ``` ## Excluding columns {#excluding-columns} We can also choose to exclude a field by using the [`EXCEPT`](/sql-reference/statements/select#except) modifier. 
For example, to remove the `tolls_amount` column, we would write the following query: ```sql -FROM nyc_taxi.trips -SELECT +FROM nyc_taxi.trips +SELECT COLUMNS('.*_amount|fee|tax') EXCEPT(tolls_amount) REPLACE( total_amount*2 AS total_amount, mta_tax/1.1 AS mta_tax - ) + ) APPLY(avg) APPLY(col -> round(col, 2)); ``` > [Try this query in the SQL playground](https://sql.clickhouse.com?query=RlJPTSBueWNfdGF4aS50cmlwcyAKU0VMRUNUIAogIENPTFVNTlMoJy4qX2Ftb3VudHxmZWV8dGF4JykgRVhDRVBUKHRvbGxzX2Ftb3VudCkKICBSRVBMQUNFKAogICAgdG90YWxfYW1vdW50KjIgQVMgdG90YWxfYW1vdW50LAogICAgbXRhX3RheC8xLjEgQVMgbXRhX3RheAogICkgCiAgQVBQTFkoYXZnKQogIEFQUExZKGNvbCAtPiByb3VuZChjb2wsIDIpKTs&run_query=true) - - ```text ┌─round(avg(fare_amount), 2)─┬─round(avg(di⋯, 1.1)), 2)─┬─round(avg(tip_amount), 2)─┬─round(avg(ehail_fee), 2)─┬─round(avg(mu⋯nt, 2)), 2)─┐ 1. │ 11.8 │ 0.41 │ 1.35 │ 0 │ 28.85 │ - └────────────────────────────┴──────────────────────────┴───────────────────────────┴──────────────────────────┴──────────────────────────┘ -``` \ No newline at end of file + └────────────────────────────┴──────────────────────────┴───────────────────────────┴──────────────────────────┴──────────────────────────┘ + ``` diff --git a/docs/guides/developer/lightweight-update.md b/docs/guides/developer/lightweight-update.md index 21b47bf6489..15543d1e150 100644 --- a/docs/guides/developer/lightweight-update.md +++ b/docs/guides/developer/lightweight-update.md @@ -90,5 +90,4 @@ These behaviours are controlled by the following settings: - `mutations_execute_nondeterministic_on_initiator` - if true, non-deterministic functions are executed on the initiator replica and are replaced as literals in `UPDATE` and `DELETE` queries. Default value: `false`. - `mutations_execute_subqueries_on_initiator` - if true, scalar subqueries are executed on the initiator replica and are replaced as literals in `UPDATE` and `DELETE` queries. Default value: `false`. - - `mutations_max_literal_size_to_replace` - The maximum size of serialized literals in bytes to replace in `UPDATE` and `DELETE` queries. Default value: `16384` (16 KiB). - + - `mutations_max_literal_size_to_replace` - The maximum size of serialized literals in bytes to replace in `UPDATE` and `DELETE` queries. Default value: `16384` (16 KiB). 
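As an illustrative aside (not taken from the patch above, and assuming the same `nyc_taxi.trips` table), the `EXCEPT` modifier also accepts a list of column names, so several columns can be dropped from the matched set before `APPLY` runs:

```sql
-- Hedged sketch: exclude two columns from the regex match, then aggregate and round
SELECT COLUMNS('.*_amount|fee|tax') EXCEPT(tolls_amount, ehail_fee) APPLY(avg) APPLY(col -> round(col, 2))
FROM nyc_taxi.trips;
```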
diff --git a/docs/guides/developer/merge-table-function.md b/docs/guides/developer/merge-table-function.md index 4a3128c0903..573f61d5c34 100644 --- a/docs/guides/developer/merge-table-function.md +++ b/docs/guides/developer/merge-table-function.md @@ -22,13 +22,13 @@ The import statements are shown below: CREATE OR REPLACE TABLE atp_matches_1960s ORDER BY tourney_id AS SELECT tourney_id, surface, winner_name, loser_name, winner_seed, loser_seed, score FROM url('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/refs/heads/master/atp_matches_{1968..1969}.csv') -SETTINGS schema_inference_make_columns_nullable=0, +SETTINGS schema_inference_make_columns_nullable=0, schema_inference_hints='winner_seed Nullable(String), loser_seed Nullable(UInt8)'; -CREATE OR REPLACE TABLE atp_matches_1970s ORDER BY tourney_id AS +CREATE OR REPLACE TABLE atp_matches_1970s ORDER BY tourney_id AS SELECT tourney_id, surface, winner_name, loser_name, winner_seed, loser_seed, splitByWhitespace(score) AS score FROM url('https://raw.githubusercontent.com/JeffSackmann/tennis_atp/refs/heads/master/atp_matches_{1970..1979}.csv') -SETTINGS schema_inference_make_columns_nullable=0, +SETTINGS schema_inference_make_columns_nullable=0, schema_inference_hints='winner_seed Nullable(UInt8), loser_seed Nullable(UInt8)'; CREATE OR REPLACE TABLE atp_matches_1980s ORDER BY tourney_id AS @@ -47,7 +47,7 @@ SETTINGS schema_inference_make_columns_nullable=0, ``` ## Schema of multiple tables {#schema-multiple-tables} - + We can run the following query to list the columns in each table along with their types side by side, so that it's easier to see the differences. ```sql @@ -169,7 +169,6 @@ AND multiIf( We could also use this virtual column as part of a query to count the values for the `walkover` column: - ```sql SELECT _table, walkover, count() FROM merge('atp_matches*') @@ -190,7 +189,6 @@ ORDER BY _table; We can see that the `walkover` column is `NULL` for everything except `atp_matches_1990s`. We'll need to update our query to check whether the `score` column contains the string `W/O` if the `walkover` column is `NULL`: - ```sql SELECT _table, multiIf( @@ -211,7 +209,6 @@ ORDER BY _table; If the underlying type of `score` is `Array(String)` we have to go over the array and look for `W/O`, whereas if it has a type of `String` we can just search for `W/O` in the string. - ```text ┌─_table────────────┬─multiIf(isNo⋯, '%W/O%'))─┬─count()─┐ │ atp_matches_1960s │ true │ 242 │ @@ -223,4 +220,4 @@ If the underlying type of `score` is `Array(String)` we have to go over the arra │ atp_matches_1990s │ true │ 128 │ │ atp_matches_1990s │ false │ 37022 │ └───────────────────┴──────────────────────────┴─────────┘ -``` \ No newline at end of file +``` diff --git a/docs/guides/developer/mutations.md b/docs/guides/developer/mutations.md index 6f19b0cadd7..6e38d7cdf48 100644 --- a/docs/guides/developer/mutations.md +++ b/docs/guides/developer/mutations.md @@ -10,11 +10,11 @@ show_related_blogs: false # Updating and deleting ClickHouse data with mutations -Although ClickHouse is geared toward high volume analytic workloads, it is possible in some situations to modify or +Although ClickHouse is geared toward high volume analytic workloads, it is possible in some situations to modify or delete existing data. These operations are labeled "mutations" and are executed using the `ALTER TABLE` command. 
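For orientation, a minimal sketch of what such a mutation looks like (illustrative only, not part of the patch; the `url` column is hypothetical, while `website.clicks`, `visitor_id`, and `visit_date` follow the examples further down):

```sql
-- Mutations run asynchronously by default; mutations_sync = 2 waits for all replicas
SET mutations_sync = 2;

-- Rewrite a column value for matching rows (url is a hypothetical column)
ALTER TABLE website.clicks UPDATE url = 'https://example.com/landing' WHERE visitor_id = 1001;

-- Remove matching rows
ALTER TABLE website.clicks DELETE WHERE visit_date < '2022-01-01';
```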
:::tip -If you need to perform frequent updates, consider using [deduplication](../developer/deduplication.md) in ClickHouse, which allows you to update +If you need to perform frequent updates, consider using [deduplication](../developer/deduplication.md) in ClickHouse, which allows you to update and/or delete rows without generating a mutation event. Alternatively, use [lightweight updates](/guides/developer/lightweight-update) or [lightweight deletes](/guides/developer/lightweight-delete) ::: @@ -31,7 +31,7 @@ ALTER TABLE [<database>.]<table> UPDATE <column> = <expression> WHERE <filter_expr>
The `<filter_expr>` should return a UInt8 value for each row of data. ALTER TABLE website.clicks DELETE WHERE visitor_id in (253, 1002, 4277) ``` -2. What does this query alter? ```sql ALTER TABLE clicks ON CLUSTER main_cluster DELETE WHERE visit_date < '2022-01-02 15:00:00' AND page_id = '573' ``` -:::note -To delete all of the data in a table, it is more efficient to use the command `TRUNCATE TABLE [<database>.]<table>` command. This command can also be executed `ON CLUSTER`. -::: + :::note + To delete all of the data in a table, it is more efficient to use the `TRUNCATE TABLE [<database>.]<table>` command. This command can also be executed `ON CLUSTER`. + ::: -View the [`DELETE` statement](/sql-reference/statements/delete.md) docs page for more details. + View the [`DELETE` statement](/sql-reference/statements/delete.md) docs page for more details. ## Lightweight deletes {#lightweight-deletes} @@ -104,4 +104,3 @@ DELETE FROM hits WHERE Title LIKE '%hello%'; A few notes about lightweight deletes: - This feature is only available for the `MergeTree` table engine family. - Lightweight deletes are asynchronous by default. Set `mutations_sync` equal to 1 to wait for one replica to process the statement, and set `mutations_sync` to 2 to wait for all replicas. - diff --git a/docs/guides/developer/replacing-merge-tree.md b/docs/guides/developer/replacing-merge-tree.md index cb9d33b01ea..53b0618de4d 100644 --- a/docs/guides/developer/replacing-merge-tree.md +++ b/docs/guides/developer/replacing-merge-tree.md @@ -25,30 +25,24 @@ During this process, the following occurs during part merging: - The row identified by the value 2 for column A has two update rows. The latter row is retained with a value of 6 for the price column. - The row identified by the value 3 for column A has a row with version 1 and a delete row with version 2. This delete row is retained. -As a result of this merge process, we have four rows representing the final state: + As a result of this merge process, we have four rows representing the final state: -
+ - - -
- -Note that deleted rows are never removed. They can be forcibly deleted with an `OPTIMIZE table FINAL CLEANUP`. This requires the experimental setting `allow_experimental_replacing_merge_with_cleanup=1`. This should only be issued under the following conditions: + Note that deleted rows are never removed. They can be forcibly deleted with an `OPTIMIZE table FINAL CLEANUP`. This requires the experimental setting `allow_experimental_replacing_merge_with_cleanup=1`. This should only be issued under the following conditions: 1. You can be sure that no rows with old versions (for those that are being deleted with the cleanup) will be inserted after the operation is issued. If these are inserted, they will be incorrectly retained, as the deleted rows will no longer be present. 2. Ensure all replicas are in sync prior to issuing the cleanup. This can be achieved with the command: -
- -```sql -SYSTEM SYNC REPLICA table -``` + ```sql + SYSTEM SYNC REPLICA table + ``` -We recommend pausing inserts once (1) is guaranteed and until this command and the subsequent cleanup are complete. + We recommend pausing inserts once (1) is guaranteed and until this command and the subsequent cleanup are complete. -> Handling deletes with the ReplacingMergeTree is only recommended for tables with a low to moderate number of deletes (less than 10%) unless periods can be scheduled for cleanup with the above conditions. + > Handling deletes with the ReplacingMergeTree is only recommended for tables with a low to moderate number of deletes (less than 10%) unless periods can be scheduled for cleanup with the above conditions. -> Tip: Users may also be able to issue `OPTIMIZE FINAL CLEANUP` against selective partitions no longer subject to changes. + > Tip: Users may also be able to issue `OPTIMIZE FINAL CLEANUP` against selective partitions no longer subject to changes. ## Choosing a primary/deduplication key {#choosing-a-primarydeduplication-key} diff --git a/docs/guides/developer/time-series-filling-gaps.md b/docs/guides/developer/time-series-filling-gaps.md index 0d032390062..3467fea9e20 100644 --- a/docs/guides/developer/time-series-filling-gaps.md +++ b/docs/guides/developer/time-series-filling-gaps.md @@ -118,7 +118,6 @@ We can see that the gaps have been filled with 0 values in the `count` column. There is, however, still a gap at the beginning of the time range, which we can fix by specifying `FROM`: - ```sql SELECT toStartOfInterval(timestamp, toIntervalMillisecond(100)) AS bucket, @@ -207,7 +206,7 @@ The gaps have all now been filled and we have entries for every 100 ms from `00: ## Cumulative count {#cumulative-count} -Let's say we now want to keep a cumulative count of the number of images created across the buckets. +Let's say we now want to keep a cumulative count of the number of images created across the buckets. We can do this by adding a `cumulative` column, as shown below: ```sql @@ -300,7 +299,7 @@ INTERPOLATE (cumulative); └─────────────────────────┴───────┴────────────┘ ``` -That looks much better. +That looks much better. And now to finish it off, let's add a bar chart using the `bar` function, not forgetting to add our new column to the `INTERPOLATE` clause. ```sql diff --git a/docs/guides/developer/ttl.md b/docs/guides/developer/ttl.md index 49361a6c6cc..55e11352506 100644 --- a/docs/guides/developer/ttl.md +++ b/docs/guides/developer/ttl.md @@ -20,9 +20,9 @@ TTL (time-to-live) refers to the capability of having rows or columns moved, del - Moving data between disks: after a certain amount of time, you can move data between storage volumes - useful for deploying a hot/warm/cold architecture - Data rollup: rollup your older data into various useful aggregations and computations before deleting it -:::note -TTL can be applied to entire tables or specific columns. -::: + :::note + TTL can be applied to entire tables or specific columns. + ::: ## TTL syntax {#ttl-syntax} @@ -44,9 +44,9 @@ ORDER BY tuple() - The y column has a time to live of 1 day from the timestamp column - When the interval lapses, the column expires. ClickHouse replaces the column value with the default value of its data type. If all the column values in the data part expire, ClickHouse deletes this column from the data part in the filesystem. -:::note -TTL rules can be altered or deleted. See the [Manipulations with Table TTL](/sql-reference/statements/alter/ttl.md) page for more details. 
-::: + :::note + TTL rules can be altered or deleted. See the [Manipulations with Table TTL](/sql-reference/statements/alter/ttl.md) page for more details. + ::: ## Triggering TTL events {#triggering-ttl-events} @@ -55,17 +55,17 @@ The deleting or aggregating of expired rows is not immediate - it only occurs du - `merge_with_ttl_timeout`: the minimum delay in seconds before repeating a merge with delete TTL. The default is 14400 seconds (4 hours). - `merge_with_recompression_ttl_timeout`: the minimum delay in seconds before repeating a merge with recompression TTL (rules that roll up data before deleting). Default value: 14400 seconds (4 hours). -So by default, your TTL rules will be applied to your table at least once every 4 hours. Just modify the settings above if you need your TTL rules applied more frequently. + So by default, your TTL rules will be applied to your table at least once every 4 hours. Just modify the settings above if you need your TTL rules applied more frequently. -:::note -Not a great solution (or one that we recommend you use frequently), but you can also force a merge using `OPTIMIZE`: + :::note + Not a great solution (or one that we recommend you use frequently), but you can also force a merge using `OPTIMIZE`: -```sql -OPTIMIZE TABLE example1 FINAL -``` + ```sql + OPTIMIZE TABLE example1 FINAL + ``` -`OPTIMIZE` initializes an unscheduled merge of the parts of your table, and `FINAL` forces a reoptimization if your table is already a single part. -::: + `OPTIMIZE` initializes an unscheduled merge of the parts of your table, and `FINAL` forces a reoptimization if your table is already a single part. + ::: ## Removing rows {#removing-rows} @@ -84,7 +84,7 @@ TTL timestamp + INTERVAL 12 HOUR ``` Additionally, it is possible to define a TTL rule based on the record's value. -This is easily implemented by specifying a where condition. +This is easily implemented by specifying a where condition. Multiple conditions are allowed: ```sql @@ -150,8 +150,8 @@ A common practice when working with large amounts of data is to move that data a 1. The `TO DISK` and `TO VOLUME` options refer to the names of disks or volumes defined in your ClickHouse configuration files. Create a new file named `my_system.xml` (or any file name) that defines your disks, then define volumes that use your disks. Place the XML file in `/etc/clickhouse-server/config.d/` to have the configuration applied to your system: -```xml - + ```xml + @@ -185,77 +185,77 @@ A common practice when working with large amounts of data is to move that data a - -``` + + ``` 2. The configuration above refers to three disks that point to folders that ClickHouse can read from and write to. Volumes can contain one or more disks - we defined a volume for each of the three disks. 
Let's view the disks: -```sql -SELECT name, path, free_space, total_space -FROM system.disks -``` + ```sql + SELECT name, path, free_space, total_space + FROM system.disks + ``` -```response -┌─name────────┬─path───────────┬───free_space─┬──total_space─┐ -│ cold_disk │ ./data/cold/ │ 179143311360 │ 494384795648 │ -│ default │ ./ │ 179143311360 │ 494384795648 │ -│ hot_disk │ ./data/hot/ │ 179143311360 │ 494384795648 │ -│ warm_disk │ ./data/warm/ │ 179143311360 │ 494384795648 │ -└─────────────┴────────────────┴──────────────┴──────────────┘ -``` + ```response + ┌─name────────┬─path───────────┬───free_space─┬──total_space─┐ + │ cold_disk │ ./data/cold/ │ 179143311360 │ 494384795648 │ + │ default │ ./ │ 179143311360 │ 494384795648 │ + │ hot_disk │ ./data/hot/ │ 179143311360 │ 494384795648 │ + │ warm_disk │ ./data/warm/ │ 179143311360 │ 494384795648 │ + └─────────────┴────────────────┴──────────────┴──────────────┘ + ``` 3. And...let's verify the volumes: -```sql -SELECT + ```sql + SELECT volume_name, disks -FROM system.storage_policies -``` - -```response -┌─volume_name─┬─disks─────────┐ -│ default │ ['default'] │ -│ hot_volume │ ['hot_disk'] │ -│ warm_volume │ ['warm_disk'] │ -│ cold_volume │ ['cold_disk'] │ -└─────────────┴───────────────┘ -``` + FROM system.storage_policies + ``` + + ```response + ┌─volume_name─┬─disks─────────┐ + │ default │ ['default'] │ + │ hot_volume │ ['hot_disk'] │ + │ warm_volume │ ['warm_disk'] │ + │ cold_volume │ ['cold_disk'] │ + └─────────────┴───────────────┘ + ``` 4. Now we will add a `TTL` rule that moves the data between the hot, warm and cold volumes: -```sql -ALTER TABLE my_table - MODIFY TTL + ```sql + ALTER TABLE my_table + MODIFY TTL trade_date TO VOLUME 'hot_volume', trade_date + INTERVAL 2 YEAR TO VOLUME 'warm_volume', trade_date + INTERVAL 4 YEAR TO VOLUME 'cold_volume'; -``` + ``` 5. The new `TTL` rule should materialize, but you can force it to make sure: -```sql -ALTER TABLE my_table + ```sql + ALTER TABLE my_table MATERIALIZE TTL -``` + ``` 6. Verify your data has moved to its expected disks using the `system.parts` table: -```sql -Using the system.parts table, view which disks the parts are on for the crypto_prices table: + ```sql + Using the system.parts table, view which disks the parts are on for the crypto_prices table: -SELECT + SELECT name, disk_name -FROM system.parts -WHERE (table = 'my_table') AND (active = 1) -``` - -The response will look like: - -```response -┌─name────────┬─disk_name─┐ -│ all_1_3_1_5 │ warm_disk │ -│ all_2_2_0 │ hot_disk │ -└─────────────┴───────────┘ -``` + FROM system.parts + WHERE (table = 'my_table') AND (active = 1) + ``` + + The response will look like: + + ```response + ┌─name────────┬─disk_name─┐ + │ all_1_3_1_5 │ warm_disk │ + │ all_2_2_0 │ hot_disk │ + └─────────────┴───────────┘ + ``` diff --git a/docs/guides/developer/understanding-query-execution-with-the-analyzer.md b/docs/guides/developer/understanding-query-execution-with-the-analyzer.md index 24620bf6eb1..07b70028b3e 100644 --- a/docs/guides/developer/understanding-query-execution-with-the-analyzer.md +++ b/docs/guides/developer/understanding-query-execution-with-the-analyzer.md @@ -79,7 +79,6 @@ The new architecture should provide us with a better framework to improve ClickH The analyzer is an important step of the query execution. It takes an AST and transforms it into a query tree. The main benefit of a query tree over an AST is that a lot of the components will be resolved, like the storage for instance. 
We also know from which table to read, aliases are also resolved, and the tree knows the different data types used. With all these benefits, the analyzer can apply optimizations. The way these optimizations work is via "passes". Every pass is going to look for different optimizations. You can see all the passes [here](https://github.com/ClickHouse/ClickHouse/blob/76578ebf92af3be917cd2e0e17fea2965716d958/src/Analyzer/QueryTreePassManager.cpp#L249), let's see it in practice with our previous query: - ```sql EXPLAIN QUERY TREE passes=0 SELECT min(timestamp) AS minimum_date, max(timestamp) AS maximum_date FROM session_events SETTINGS allow_experimental_analyzer=1; @@ -127,7 +126,6 @@ EXPLAIN QUERY TREE passes=20 SELECT min(timestamp) AS minimum_date, max(timestam Between the two executions, you can see the resolution of aliases and projections. - ## Planner {#planner} The planner takes a query tree and builds a query plan out of it. The query tree tells us what we want to do with a specific query, and the query plan tells us how we will do it. Additional optimizations are going to be done as part of the query plan. You can use `EXPLAIN PLAN` or `EXPLAIN` to see the query plan (`EXPLAIN` will execute `EXPLAIN PLAN`). @@ -200,7 +198,6 @@ SELECT FROM session_events GROUP BY type - ┌─explain────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Expression ((Projection + Before ORDER BY)) │ │ Actions: INPUT :: 0 -> type String : 0 │ diff --git a/docs/guides/examples/aggregate_function_combinators/anyIf.md b/docs/guides/examples/aggregate_function_combinators/anyIf.md index a0368717512..d55deb5b9a3 100644 --- a/docs/guides/examples/aggregate_function_combinators/anyIf.md +++ b/docs/guides/examples/aggregate_function_combinators/anyIf.md @@ -27,7 +27,7 @@ CREATE TABLE sales( transaction_id UInt32, amount Decimal(10,2), is_successful UInt8 -) +) ENGINE = MergeTree() ORDER BY tuple(); diff --git a/docs/guides/examples/aggregate_function_combinators/argMaxIf.md b/docs/guides/examples/aggregate_function_combinators/argMaxIf.md index f3ebb5dc512..beb42c6cb7c 100644 --- a/docs/guides/examples/aggregate_function_combinators/argMaxIf.md +++ b/docs/guides/examples/aggregate_function_combinators/argMaxIf.md @@ -15,13 +15,13 @@ function to find the value of `arg` that corresponds to the maximum value of `va using the `argMaxIf` aggregate combinator function. The `argMaxIf` function is useful when you need to find the value associated with -the maximum value in a dataset, but only for rows that satisfy a specific +the maximum value in a dataset, but only for rows that satisfy a specific condition. ## Example usage {#example-usage} -In this example, we'll use a sample dataset of product sales to demonstrate how -`argMaxIf` works. We'll find the product name that has the highest price, but +In this example, we'll use a sample dataset of product sales to demonstrate how +`argMaxIf` works. We'll find the product name that has the highest price, but only for products that have been sold at least 10 times. ```sql title="Query" @@ -43,16 +43,16 @@ SELECT argMaxIf(product_name, price, sales_count >= 10) AS most_expensive_popula FROM product_sales; ``` -The `argMaxIf` function will return the product name that has the highest price -among all products that have been sold at least 10 times (sales_count >= 10). 
-In this case, it will return 'Laptop' since it has the highest price (999.99) +The `argMaxIf` function will return the product name that has the highest price +among all products that have been sold at least 10 times (sales_count >= 10). +In this case, it will return 'Laptop' since it has the highest price (999.99) among the popular products. ```response title="Response" ┌─most_expensi⋯lar_product─┐ 1. │ Laptop │ - └──────────────────────────┘ -``` + └──────────────────────────┘ + ``` ## See also {#see-also} - [`argMax`](/sql-reference/aggregate-functions/reference/argmax) diff --git a/docs/guides/examples/aggregate_function_combinators/argMinIf.md b/docs/guides/examples/aggregate_function_combinators/argMinIf.md index 9fd236e7ac7..78ae59ea078 100644 --- a/docs/guides/examples/aggregate_function_combinators/argMinIf.md +++ b/docs/guides/examples/aggregate_function_combinators/argMinIf.md @@ -14,8 +14,8 @@ The [`If`](/sql-reference/aggregate-functions/combinators#-if) combinator can be function to find the value of `arg` that corresponds to the minimum value of `val` for rows where the condition is true, using the `argMinIf` aggregate combinator function. -The `argMinIf` function is useful when you need to find the value associated -with the minimum value in a dataset, but only for rows that satisfy a specific +The `argMinIf` function is useful when you need to find the value associated +with the minimum value in a dataset, but only for rows that satisfy a specific condition. ## Example usage {#example-usage} @@ -52,10 +52,10 @@ but only considering rows where `in_stock = 1`. For example: - Product 2: Among in-stock rows, 20.99 has the earliest timestamp (11:00:00) ```response title="Response" - ┌─product_id─┬─lowest_price_when_in_stock─┐ -1. │ 1 │ 10.99 │ -2. │ 2 │ 20.99 │ - └────────────┴────────────────────────────┘ + ┌─product_id─┬─lowest_price_when_in_stock─┐ + 1. │ 1 │ 10.99 │ + 2. │ 2 │ 20.99 │ + └────────────┴────────────────────────────┘ ``` ## See also {#see-also} diff --git a/docs/guides/examples/aggregate_function_combinators/avgIf.md b/docs/guides/examples/aggregate_function_combinators/avgIf.md index a77130bef53..37280a1f030 100644 --- a/docs/guides/examples/aggregate_function_combinators/avgIf.md +++ b/docs/guides/examples/aggregate_function_combinators/avgIf.md @@ -45,8 +45,8 @@ In this case, it will average the amounts: 100.50, 200.75, 300.00, and 175.25. ```response title="Response" ┌─avg_successful_sale─┐ 1. │ 193.88 │ - └─────────────────────┘ -``` + └─────────────────────┘ + ``` ## See also {#see-also} - [`avg`](/sql-reference/aggregate-functions/reference/avg) diff --git a/docs/guides/examples/aggregate_function_combinators/avgMap.md b/docs/guides/examples/aggregate_function_combinators/avgMap.md index 51f73f3cf48..4bdddf944cc 100644 --- a/docs/guides/examples/aggregate_function_combinators/avgMap.md +++ b/docs/guides/examples/aggregate_function_combinators/avgMap.md @@ -11,13 +11,13 @@ sidebar_label: 'avgMap' ## Description {#description} The [`Map`](/sql-reference/aggregate-functions/combinators#-map) combinator can be applied to the [`avg`](/sql-reference/aggregate-functions/reference/avg) -function to calculate the arithmetic mean of values in a Map according to each key, using the `avgMap` +function to calculate the arithmetic mean of values in a Map according to each key, using the `avgMap` aggregate combinator function. 
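As a quick self-contained sketch of the behaviour (illustrative only, not taken from the patch), `avgMap` averages the values of each key independently across rows, so a key present in only one row is averaged over that single occurrence:

```sql
-- 'a' is averaged over 10 and 30; 'b' appears once, so its average is 20
SELECT avgMap(counts) AS avg_by_key
FROM
(
    SELECT map('a', 10, 'b', 20) AS counts
    UNION ALL
    SELECT map('a', 30) AS counts
);
-- avg_by_key: {'a':20,'b':20}
```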
## Example usage {#example-usage} In this example, we'll create a table that stores status codes and their counts for different timeslots, -where each row contains a Map of status codes to their corresponding counts. We'll use +where each row contains a Map of status codes to their corresponding counts. We'll use `avgMap` to calculate the average count for each status code within each timeslot. ```sql title="Query" @@ -42,23 +42,23 @@ GROUP BY timeslot; The `avgMap` function will calculate the average count for each status code within each timeslot. For example: - In timeslot '2000-01-01 00:00:00': - - Status 'a': 15 - - Status 'b': 25 - - Status 'c': (35 + 45) / 2 = 40 - - Status 'd': 55 - - Status 'e': 65 + - Status 'a': 15 + - Status 'b': 25 + - Status 'c': (35 + 45) / 2 = 40 + - Status 'd': 55 + - Status 'e': 65 - In timeslot '2000-01-01 00:01:00': - - Status 'd': 75 - - Status 'e': 85 - - Status 'f': (95 + 105) / 2 = 100 - - Status 'g': (115 + 125) / 2 = 120 + - Status 'd': 75 + - Status 'e': 85 + - Status 'f': (95 + 105) / 2 = 100 + - Status 'g': (115 + 125) / 2 = 120 -```response title="Response" - ┌────────────timeslot─┬─avgMap(status)───────────────────────┐ -1. │ 2000-01-01 00:01:00 │ {'d':75,'e':85,'f':100,'g':120} │ -2. │ 2000-01-01 00:00:00 │ {'a':15,'b':25,'c':40,'d':55,'e':65} │ - └─────────────────────┴──────────────────────────────────────┘ -``` + ```response title="Response" + ┌────────────timeslot─┬─avgMap(status)───────────────────────┐ + 1. │ 2000-01-01 00:01:00 │ {'d':75,'e':85,'f':100,'g':120} │ + 2. │ 2000-01-01 00:00:00 │ {'a':15,'b':25,'c':40,'d':55,'e':65} │ + └─────────────────────┴──────────────────────────────────────┘ + ``` ## See also {#see-also} - [`avg`](/sql-reference/aggregate-functions/reference/avg) diff --git a/docs/guides/examples/aggregate_function_combinators/avgMerge.md b/docs/guides/examples/aggregate_function_combinators/avgMerge.md index 34a9827561f..07b3c3c5c5e 100644 --- a/docs/guides/examples/aggregate_function_combinators/avgMerge.md +++ b/docs/guides/examples/aggregate_function_combinators/avgMerge.md @@ -16,7 +16,7 @@ function to produce a final result by combining partial aggregate states. ## Example usage {#example-usage} -The `Merge` combinator is closely related to the `State` combinator. Refer to +The `Merge` combinator is closely related to the `State` combinator. Refer to ["avgState example usage"](/examples/aggregate-function-combinators/avgState/#example-usage) for an example of both `avgMerge` and `avgState`. diff --git a/docs/guides/examples/aggregate_function_combinators/avgMergeState.md b/docs/guides/examples/aggregate_function_combinators/avgMergeState.md index 916e21fb12a..666fd33d9de 100644 --- a/docs/guides/examples/aggregate_function_combinators/avgMergeState.md +++ b/docs/guides/examples/aggregate_function_combinators/avgMergeState.md @@ -20,10 +20,10 @@ return a new intermediate aggregation state. ## Example usage {#example-usage} -The `MergeState` combinator is particularly useful for multi-level aggregation -scenarios where you want to combine pre-aggregated states and maintain them as +The `MergeState` combinator is particularly useful for multi-level aggregation +scenarios where you want to combine pre-aggregated states and maintain them as states (rather than finalizing them) for further processing. 
To illustrate, we'll -look at an example in which we transform individual server performance metrics +look at an example in which we transform individual server performance metrics into hierarchical aggregations across multiple levels: Server level → Region level → Datacenter level. @@ -123,12 +123,12 @@ INSERT INTO raw_server_metrics (timestamp, server_id, region, datacenter, respon We'll write three queries for each of the levels: - + ```sql SELECT - server_id, - region, - avgMerge(avg_response_time) AS avg_response_ms +server_id, +region, +avgMerge(avg_response_time) AS avg_response_ms FROM server_performance GROUP BY server_id, region ORDER BY region, server_id; @@ -143,13 +143,13 @@ ORDER BY region, server_id; │ 202 │ us-west │ 105 │ └───────────┴────────────┴─────────────────┘ ``` - - + + ```sql SELECT - region, - datacenter, - avgMerge(avg_response_time) AS avg_response_ms +region, +datacenter, +avgMerge(avg_response_time) AS avg_response_ms FROM region_performance GROUP BY region, datacenter ORDER BY datacenter, region; @@ -161,12 +161,12 @@ ORDER BY datacenter, region; │ eu-central │ dc2 │ 150 │ └────────────┴────────────┴────────────────────┘ ``` - - + + ```sql SELECT - datacenter, - avgMerge(avg_response_time) AS avg_response_ms +datacenter, +avgMerge(avg_response_time) AS avg_response_ms FROM datacenter_performance GROUP BY datacenter ORDER BY datacenter; @@ -177,7 +177,7 @@ ORDER BY datacenter; │ dc2 │ 150 │ └────────────┴─────────────────┘ ``` - + We can insert more data: @@ -189,7 +189,7 @@ INSERT INTO raw_server_metrics (timestamp, server_id, region, datacenter, respon (now(), 301, 'eu-central', 'dc2', 135); ``` -Let's check the datacenter-level performance again. Notice how the entire +Let's check the datacenter-level performance again. Notice how the entire aggregation chain updated automatically: ```sql diff --git a/docs/guides/examples/aggregate_function_combinators/avgResample.md b/docs/guides/examples/aggregate_function_combinators/avgResample.md index bdbeb9f91d5..e7743937af3 100644 --- a/docs/guides/examples/aggregate_function_combinators/avgResample.md +++ b/docs/guides/examples/aggregate_function_combinators/avgResample.md @@ -10,7 +10,7 @@ sidebar_label: 'avgResample' ## Description {#description} -The [`Resample`](/sql-reference/aggregate-functions/combinators#-resample) +The [`Resample`](/sql-reference/aggregate-functions/combinators#-resample) combinator can be applied to the [`count`](/sql-reference/aggregate-functions/reference/count) aggregate function to count values of a specified key column in a fixed number of intervals (`N`). @@ -23,12 +23,12 @@ Let's look at an example. We'll create a table which contains the `name`, `age` `wage` of employees, and we'll insert some data into it: ```sql -CREATE TABLE employee_data +CREATE TABLE employee_data ( name String, age UInt8, wage Float32 -) +) ENGINE = MergeTree() ORDER BY tuple() @@ -41,9 +41,9 @@ INSERT INTO employee_data (name, age, wage) VALUES ('Brian', 60, 16.0); ``` -Let's get the average wage of the people whose age lies in the intervals of `[30,60)` -and `[60,75)` (`[` is exclusive and `)` is inclusive). Since we use integer -representation for age, we get ages in the intervals `[30, 59]` and `[60,74]`. +Let's get the average wage of the people whose age lies in the intervals of `[30,60)` +and `[60,75)` (`[` is exclusive and `)` is inclusive). Since we use integer +representation for age, we get ages in the intervals `[30, 59]` and `[60,74]`. 
To do so we apply the `Resample` combinator to the `avg` aggregate function. ```sql diff --git a/docs/guides/examples/aggregate_function_combinators/avgState.md b/docs/guides/examples/aggregate_function_combinators/avgState.md index e0e2317701d..6674db8721f 100644 --- a/docs/guides/examples/aggregate_function_combinators/avgState.md +++ b/docs/guides/examples/aggregate_function_combinators/avgState.md @@ -10,14 +10,14 @@ sidebar_label: 'avgState' ## Description {#description} -The [`State`](/sql-reference/aggregate-functions/combinators#-state) combinator -can be applied to the [`avg`](/sql-reference/aggregate-functions/reference/avg) +The [`State`](/sql-reference/aggregate-functions/combinators#-state) combinator +can be applied to the [`avg`](/sql-reference/aggregate-functions/reference/avg) function to produce an intermediate state of `AggregateFunction(avg, T)` type where `T` is the specified type for the average. ## Example usage {#example-usage} -In this example, we'll look at how we can use the `AggregateFunction` type, +In this example, we'll look at how we can use the `AggregateFunction` type, together with the `avgState` function to aggregate website traffic data. First create the source table for website traffic data: @@ -34,8 +34,8 @@ ENGINE = MergeTree() ORDER BY (page_id, viewed_at); ``` -Create the aggregate table that will store average response times. Note that -`avg` cannot use the `SimpleAggregateFunction` type as it requires a complex +Create the aggregate table that will store average response times. Note that +`avg` cannot use the `SimpleAggregateFunction` type as it requires a complex state (a sum and a count). We therefore use the `AggregateFunction` type: ```sql @@ -49,7 +49,7 @@ ENGINE = AggregatingMergeTree() ORDER BY page_id; ``` -Create an Incremental materialized view that will act as an insert trigger to +Create an Incremental materialized view that will act as an insert trigger to new data and store the intermediate state data in the target table defined above: ```sql @@ -89,7 +89,7 @@ INSERT INTO raw_page_views (page_id, page_name, response_time_ms) VALUES Examine the target table `page_performance`: ```sql -SELECT +SELECT page_id, page_name, avg_response_time, @@ -110,12 +110,12 @@ FROM page_performance ``` Notice that the `avg_response_time` column is of type `AggregateFunction(avg, UInt32)` -and stores intermediate state information. Also notice that the row data for the -`avg_response_time` is not useful to us and we see strange text characters such -as `�, n, F, }`. This is the terminals attempt to display binary data as text. -The reason for this is that `AggregateFunction` types store their state in a -binary format that's optimized for efficient storage and computation, not for -human readability. This binary state contains all the information needed to +and stores intermediate state information. Also notice that the row data for the +`avg_response_time` is not useful to us and we see strange text characters such +as `�, n, F, }`. This is the terminals attempt to display binary data as text. +The reason for this is that `AggregateFunction` types store their state in a +binary format that's optimized for efficient storage and computation, not for +human readability. This binary state contains all the information needed to calculate the average. 
To make use of it, use the `Merge` combinator: diff --git a/docs/guides/examples/aggregate_function_combinators/countIf.md b/docs/guides/examples/aggregate_function_combinators/countIf.md index 53aac092dd4..8535cd03eed 100644 --- a/docs/guides/examples/aggregate_function_combinators/countIf.md +++ b/docs/guides/examples/aggregate_function_combinators/countIf.md @@ -47,8 +47,8 @@ The `countIf` function will count only the rows where `is_successful = 1` for ea ┌─user_id─┬─successful_logins─┐ 1. │ 1 │ 2 │ 2. │ 2 │ 2 │ - └─────────┴───────────────────┘ -``` + └─────────┴───────────────────┘ + ``` ## See also {#see-also} - [`count`](/sql-reference/aggregate-functions/reference/count) diff --git a/docs/guides/examples/aggregate_function_combinators/countResample.md b/docs/guides/examples/aggregate_function_combinators/countResample.md index f90bb6a168c..c5f38823398 100644 --- a/docs/guides/examples/aggregate_function_combinators/countResample.md +++ b/docs/guides/examples/aggregate_function_combinators/countResample.md @@ -10,7 +10,7 @@ sidebar_label: 'countResample' ## Description {#description} -The [`Resample`](/sql-reference/aggregate-functions/combinators#-resample) +The [`Resample`](/sql-reference/aggregate-functions/combinators#-resample) combinator can be applied to the [`count`](/sql-reference/aggregate-functions/reference/count) aggregate function to count values of a specified key column in a fixed number of intervals (`N`). @@ -23,12 +23,12 @@ Let's look at an example. We'll create a table which contains the `name`, `age` `wage` of employees, and we'll insert some data into it: ```sql -CREATE TABLE employee_data +CREATE TABLE employee_data ( name String, age UInt8, wage Float32 -) +) ENGINE = MergeTree() ORDER BY tuple() @@ -41,9 +41,9 @@ INSERT INTO employee_data (name, age, wage) VALUES ('Brian', 60, 16.0); ``` -Let's count all the people whose age lies in the intervals of `[30,60)` +Let's count all the people whose age lies in the intervals of `[30,60)` and `[60,75)`. Since we use integer representation for age, we get ages in the -`[30, 59]` and `[60,74]` intervals. To do so we apply the `Resample` combinator +`[30, 59]` and `[60,74]` intervals. To do so we apply the `Resample` combinator to `count` ```sql diff --git a/docs/guides/examples/aggregate_function_combinators/groupArrayDistinct.md b/docs/guides/examples/aggregate_function_combinators/groupArrayDistinct.md index dd7350258fc..3e5d6aaf17e 100644 --- a/docs/guides/examples/aggregate_function_combinators/groupArrayDistinct.md +++ b/docs/guides/examples/aggregate_function_combinators/groupArrayDistinct.md @@ -20,7 +20,7 @@ For this example we'll make use of the `hits` dataset available in our [SQL play Imagine you want to find out, for each distinct landing page domain (`URLDomain`) on your website, what are all the unique User Agent OS codes (`OS`) recorded for -visitors landing on that domain. This could help you understand the variety of +visitors landing on that domain. This could help you understand the variety of operating systems interacting with different parts of your site. 
```sql runnable diff --git a/docs/guides/examples/aggregate_function_combinators/groupArrayResample.md b/docs/guides/examples/aggregate_function_combinators/groupArrayResample.md index 38176eaa49f..aa5beeb4c2c 100644 --- a/docs/guides/examples/aggregate_function_combinators/groupArrayResample.md +++ b/docs/guides/examples/aggregate_function_combinators/groupArrayResample.md @@ -10,10 +10,10 @@ sidebar_label: 'groupArrayResample' ## Description {#description} -The [`Resample`](/sql-reference/aggregate-functions/combinators#-resample) +The [`Resample`](/sql-reference/aggregate-functions/combinators#-resample) combinator can be applied to the [`groupArray`](/sql-reference/aggregate-functions/reference/sum) aggregate function to -divide the range of a specified key column into a fixed number of intervals (`N`) -and construct the resulting array by selecting one representative value +divide the range of a specified key column into a fixed number of intervals (`N`) +and construct the resulting array by selecting one representative value (corresponding to the minimum key) from the data points falling into each interval. It creates a downsampled view of the data rather than collecting all values. @@ -23,7 +23,7 @@ Let's look at an example. We'll create a table which contains the `name`, `age` `wage` of employees, and we'll insert some data into it: ```sql -CREATE TABLE employee_data +CREATE TABLE employee_data ( name String, age UInt8, @@ -40,13 +40,13 @@ INSERT INTO employee_data (name, age, wage) VALUES ('Brian', 60, 16.0); ``` -Let's get the names of the people whose age lies in the intervals of `[30,60)` +Let's get the names of the people whose age lies in the intervals of `[30,60)` and `[60,75)`. Since we use integer representation for age, we get ages in the `[30, 59]` and `[60,74]` intervals. -To aggregate names in an array, we use the `groupArray` aggregate function. +To aggregate names in an array, we use the `groupArray` aggregate function. It takes one argument. In our case, it's the name column. The `groupArrayResample` -function should use the age column to aggregate names by age. To define the +function should use the age column to aggregate names by age. To define the required intervals, we pass `30`, `75`, `30` as arguments into the `groupArrayResample` function: diff --git a/docs/guides/examples/aggregate_function_combinators/maxMap.md b/docs/guides/examples/aggregate_function_combinators/maxMap.md index e1ffe4907fb..08724a5afd3 100644 --- a/docs/guides/examples/aggregate_function_combinators/maxMap.md +++ b/docs/guides/examples/aggregate_function_combinators/maxMap.md @@ -11,13 +11,13 @@ sidebar_label: 'maxMap' ## Description {#description} The [`Map`](/sql-reference/aggregate-functions/combinators#-map) combinator can be applied to the [`max`](/sql-reference/aggregate-functions/reference/max) -function to calculate the maximum value in a Map according to each key, using the `maxMap` +function to calculate the maximum value in a Map according to each key, using the `maxMap` aggregate combinator function. ## Example usage {#example-usage} In this example, we'll create a table that stores status codes and their counts for different timeslots, -where each row contains a Map of status codes to their corresponding counts. We'll use +where each row contains a Map of status codes to their corresponding counts. We'll use `maxMap` to find the maximum count for each status code within each timeslot. 
```sql title="Query" @@ -42,22 +42,22 @@ GROUP BY timeslot; The `maxMap` function will find the maximum count for each status code within each timeslot. For example: - In timeslot '2000-01-01 00:00:00': - - Status 'a': 15 - - Status 'b': 25 - - Status 'c': max(35, 45) = 45 - - Status 'd': 55 - - Status 'e': 65 + - Status 'a': 15 + - Status 'b': 25 + - Status 'c': max(35, 45) = 45 + - Status 'd': 55 + - Status 'e': 65 - In timeslot '2000-01-01 00:01:00': - - Status 'd': 75 - - Status 'e': 85 - - Status 'f': max(95, 105) = 105 - - Status 'g': max(115, 125) = 125 + - Status 'd': 75 + - Status 'e': 85 + - Status 'f': max(95, 105) = 105 + - Status 'g': max(115, 125) = 125 ```response title="Response" - ┌────────────timeslot─┬─maxMap(status)───────────────────────┐ -1. │ 2000-01-01 00:01:00 │ {'d':75,'e':85,'f':105,'g':125} │ -2. │ 2000-01-01 00:00:00 │ {'a':15,'b':25,'c':45,'d':55,'e':65} │ - └─────────────────────┴──────────────────────────────────────┘ + ┌────────────timeslot─┬─maxMap(status)───────────────────────┐ + 1. │ 2000-01-01 00:01:00 │ {'d':75,'e':85,'f':105,'g':125} │ + 2. │ 2000-01-01 00:00:00 │ {'a':15,'b':25,'c':45,'d':55,'e':65} │ + └─────────────────────┴──────────────────────────────────────┘ ``` ## See also {#see-also} diff --git a/docs/guides/examples/aggregate_function_combinators/minMap.md b/docs/guides/examples/aggregate_function_combinators/minMap.md index e8843244f64..494f6c7ae10 100644 --- a/docs/guides/examples/aggregate_function_combinators/minMap.md +++ b/docs/guides/examples/aggregate_function_combinators/minMap.md @@ -11,13 +11,13 @@ sidebar_label: 'minMap' ## Description {#description} The [`Map`](/sql-reference/aggregate-functions/combinators#-map) combinator can be applied to the [`min`](/sql-reference/aggregate-functions/reference/min) -function to calculate the minimum value in a Map according to each key, using the `minMap` +function to calculate the minimum value in a Map according to each key, using the `minMap` aggregate combinator function. ## Example usage {#example-usage} In this example, we'll create a table that stores status codes and their counts for different timeslots, -where each row contains a Map of status codes to their corresponding counts. We'll use +where each row contains a Map of status codes to their corresponding counts. We'll use `minMap` to find the minimum count for each status code within each timeslot. ```sql title="Query" @@ -42,22 +42,22 @@ GROUP BY timeslot; The `minMap` function will find the minimum count for each status code within each timeslot. For example: - In timeslot '2000-01-01 00:00:00': - - Status 'a': 15 - - Status 'b': 25 - - Status 'c': min(35, 45) = 35 - - Status 'd': 55 - - Status 'e': 65 + - Status 'a': 15 + - Status 'b': 25 + - Status 'c': min(35, 45) = 35 + - Status 'd': 55 + - Status 'e': 65 - In timeslot '2000-01-01 00:01:00': - - Status 'd': 75 - - Status 'e': 85 - - Status 'f': min(95, 105) = 95 - - Status 'g': min(115, 125) = 115 + - Status 'd': 75 + - Status 'e': 85 + - Status 'f': min(95, 105) = 95 + - Status 'g': min(115, 125) = 115 ```response title="Response" - ┌────────────timeslot─┬─minMap(status)───────────────────────┐ -1. │ 2000-01-01 00:01:00 │ {'d':75,'e':85,'f':95,'g':115} │ -2. │ 2000-01-01 00:00:00 │ {'a':15,'b':25,'c':35,'d':55,'e':65} │ - └─────────────────────┴──────────────────────────────────────┘ + ┌────────────timeslot─┬─minMap(status)───────────────────────┐ + 1. │ 2000-01-01 00:01:00 │ {'d':75,'e':85,'f':95,'g':115} │ + 2. 
│ 2000-01-01 00:00:00 │ {'a':15,'b':25,'c':35,'d':55,'e':65} │ + └─────────────────────┴──────────────────────────────────────┘ ``` ## See also {#see-also} diff --git a/docs/guides/examples/aggregate_function_combinators/minSimpleState.md b/docs/guides/examples/aggregate_function_combinators/minSimpleState.md index 0d7fa86a4ac..aac0b12f460 100644 --- a/docs/guides/examples/aggregate_function_combinators/minSimpleState.md +++ b/docs/guides/examples/aggregate_function_combinators/minSimpleState.md @@ -11,14 +11,14 @@ sidebar_label: 'minSimpleState' ## Description {#description} The [`SimpleState`](/sql-reference/aggregate-functions/combinators#-simplestate) combinator can be applied to the [`min`](/sql-reference/aggregate-functions/reference/min) -function to return the minimum value across all input values. It returns the +function to return the minimum value across all input values. It returns the result with type [`SimpleAggregateFunction`](/docs/sql-reference/data-types/simpleaggregatefunction). ## Example usage {#example-usage} -Let's look at a practical example using a table that tracks daily temperature +Let's look at a practical example using a table that tracks daily temperature readings. For each location, we want to maintain the lowest temperature recorded. -Using the `SimpleAggregateFunction` type with `min` automatically updates the +Using the `SimpleAggregateFunction` type with `min` automatically updates the stored value when a lower temperature is encountered. Create the source table for raw temperature readings: @@ -113,7 +113,7 @@ View the updated extremes after new data: SELECT location_id, location_name, - min_temp, + min_temp, max_temp FROM temperature_extremes ORDER BY location_id; @@ -140,7 +140,7 @@ the final result from the partial states we need to add a `GROUP BY`: SELECT location_id, location_name, - min(min_temp) AS min_temp, -- Aggregate across all parts + min(min_temp) AS min_temp, -- Aggregate across all parts max(max_temp) AS max_temp -- Aggregate across all parts FROM temperature_extremes GROUP BY location_id, location_name diff --git a/docs/guides/examples/aggregate_function_combinators/quantilesTimingArrayIf.md b/docs/guides/examples/aggregate_function_combinators/quantilesTimingArrayIf.md index 3df6548f982..081f49df1ff 100644 --- a/docs/guides/examples/aggregate_function_combinators/quantilesTimingArrayIf.md +++ b/docs/guides/examples/aggregate_function_combinators/quantilesTimingArrayIf.md @@ -10,7 +10,7 @@ sidebar_label: 'quantilesTimingArrayIf' ## Description {#description} -The [`Array`](/sql-reference/aggregate-functions/combinators#-array) and [`If`](/sql-reference/aggregate-functions/combinators#-if) +The [`Array`](/sql-reference/aggregate-functions/combinators#-array) and [`If`](/sql-reference/aggregate-functions/combinators#-if) combinator can be applied to the [`quantilesTiming`](/sql-reference/aggregate-functions/reference/quantiletiming) function to calculate quantiles of timing values in arrays for rows where the condition is true, using the `quantilesTimingArrayIf` aggregate combinator function. @@ -50,11 +50,11 @@ The returned array contains the following quantiles in order: - 1.0 (maximum) ```response title="Response" - ┌─endpoint─┬─response_time_quantiles─────────────────────────────────────────────┐ -1. │ orders │ [82, 87, 92, 98, 103, 104, 105] │ -2. │ products │ [45, 47, 49, 51, 52, 52, 53] │ -3. 
│ users │ [nan, nan, nan, nan, nan, nan, nan] │ - └──────────┴─────────────────────────────────────────────────────────────────────┘ + ┌─endpoint─┬─response_time_quantiles─────────────────────────────────────────────┐ + 1. │ orders │ [82, 87, 92, 98, 103, 104, 105] │ + 2. │ products │ [45, 47, 49, 51, 52, 52, 53] │ + 3. │ users │ [nan, nan, nan, nan, nan, nan, nan] │ + └──────────┴─────────────────────────────────────────────────────────────────────┘ ``` ## See also {#see-also} diff --git a/docs/guides/examples/aggregate_function_combinators/quantilesTimingIf.md b/docs/guides/examples/aggregate_function_combinators/quantilesTimingIf.md index 4dc1ae5743b..aecef4ce6cf 100644 --- a/docs/guides/examples/aggregate_function_combinators/quantilesTimingIf.md +++ b/docs/guides/examples/aggregate_function_combinators/quantilesTimingIf.md @@ -70,11 +70,11 @@ The returned array contains the following quantiles in order: - 1.0 (maximum) ```response title="Response" - ┌─endpoint─┬─response_time_quantiles─────────────────────────────────────────────┐ -1. │ orders │ [82, 87, 92, 98, 103, 104, 105] │ -2. │ products │ [45, 47, 49, 51, 52, 52, 53] │ -3. │ users │ [nan, nan, nan, nan, nan, nan, nan] │ - └──────────┴─────────────────────────────────────────────────────────────────────┘ + ┌─endpoint─┬─response_time_quantiles─────────────────────────────────────────────┐ + 1. │ orders │ [82, 87, 92, 98, 103, 104, 105] │ + 2. │ products │ [45, 47, 49, 51, 52, 52, 53] │ + 3. │ users │ [nan, nan, nan, nan, nan, nan, nan] │ + └──────────┴─────────────────────────────────────────────────────────────────────┘ ``` ## See also {#see-also} diff --git a/docs/guides/examples/aggregate_function_combinators/sumArray.md b/docs/guides/examples/aggregate_function_combinators/sumArray.md index 0f1fdabf461..7e44584b93c 100644 --- a/docs/guides/examples/aggregate_function_combinators/sumArray.md +++ b/docs/guides/examples/aggregate_function_combinators/sumArray.md @@ -10,17 +10,17 @@ sidebar_label: 'sumArray' ## Description {#description} -The [`Array`](/sql-reference/aggregate-functions/combinators#-array) combinator +The [`Array`](/sql-reference/aggregate-functions/combinators#-array) combinator can be applied to the [`sum`](/sql-reference/aggregate-functions/reference/sum) -function to calculate the sum of all elements in an array, using the `sumArray` +function to calculate the sum of all elements in an array, using the `sumArray` aggregate combinator function. -The `sumArray` function is useful when you need to calculate the total sum of +The `sumArray` function is useful when you need to calculate the total sum of all elements across multiple arrays in a dataset. ## Example usage {#example-usage} -In this example, we'll use a sample dataset of daily sales across different +In this example, we'll use a sample dataset of daily sales across different product categories to demonstrate how `sumArray` works. We'll calculate the total sales across all categories for each day. @@ -36,7 +36,7 @@ INSERT INTO daily_category_sales VALUES ('2024-01-02', [120, 180, 160]), ('2024-01-03', [90, 220, 140]); -SELECT +SELECT date, category_sales, sumArray(category_sales) AS total_sales_sumArray, @@ -45,8 +45,8 @@ FROM daily_category_sales GROUP BY date, category_sales; ``` -The `sumArray` function will sum up all elements in each `category_sales` array. -For example, on `2024-01-01`, it sums `100 + 200 + 150 = 450`. This gives the +The `sumArray` function will sum up all elements in each `category_sales` array. 
+For example, on `2024-01-01`, it sums `100 + 200 + 150 = 450`. This gives the same result as `arraySum`. ## See also {#see-also} diff --git a/docs/guides/examples/aggregate_function_combinators/sumForEach.md b/docs/guides/examples/aggregate_function_combinators/sumForEach.md index 8184ccf01f2..4a47cb0d66a 100644 --- a/docs/guides/examples/aggregate_function_combinators/sumForEach.md +++ b/docs/guides/examples/aggregate_function_combinators/sumForEach.md @@ -19,15 +19,15 @@ array columns, applying the aggregate to each element in the array across rows. For this example we'll make use of the `hits` dataset available in our [SQL playground](https://sql.clickhouse.com/). -The `hits` table contains a column called `isMobile` of type UInt8 which can be +The `hits` table contains a column called `isMobile` of type UInt8 which can be `0` for Desktop or `1` for mobile: ```sql runnable SELECT EventTime, IsMobile FROM metrica.hits ORDER BY rand() LIMIT 10 ``` -We'll use the `sumForEach` aggregate combinator function to analyze how -desktop versus mobile traffic varies by hour of the day. Click the play button +We'll use the `sumForEach` aggregate combinator function to analyze how +desktop versus mobile traffic varies by hour of the day. Click the play button below to run the query interactively: ```sql runnable diff --git a/docs/guides/examples/aggregate_function_combinators/sumIf.md b/docs/guides/examples/aggregate_function_combinators/sumIf.md index 7b70e7117ea..c948e4bdf5b 100644 --- a/docs/guides/examples/aggregate_function_combinators/sumIf.md +++ b/docs/guides/examples/aggregate_function_combinators/sumIf.md @@ -45,8 +45,8 @@ In this case, it will sum: 100.50 + 200.75 + 300.00 + 175.25. ```response title="Response" ┌─total_successful_sales─┐ 1. │ 776.50 │ - └───────────────────────┘ -``` + └───────────────────────┘ + ``` ### Calculate trading volume by price direction {#calculate-trading-vol-price-direction} @@ -54,7 +54,7 @@ In this example we'll use the `stock` table available at [ClickHouse playground] to calculate trading volume by price direction in the first half of the year 2002. ```sql title="Query" -SELECT +SELECT toStartOfMonth(date) AS month, formatReadableQuantity(sumIf(volume, price > open)) AS volume_on_up_days, formatReadableQuantity(sumIf(volume, price < open)) AS volume_on_down_days, @@ -81,7 +81,7 @@ ORDER BY month; 11. │ 2002-11-01 │ 34.90 billion │ 25.47 billion │ 998.34 million │ 61.37 billion │ 12. │ 2002-12-01 │ 22.99 billion │ 28.65 billion │ 1.14 billion │ 52.79 billion │ └────────────┴───────────────────┴─────────────────────┴────────────────────────┴───────────────┘ -``` + ``` ### Calculate trading volume by stock symbol {#calculate-trading-volume} @@ -90,7 +90,7 @@ to calculate trading volume by stock symbol in 2006 for three of the largest tec companies at the time. ```sql title="Query" -SELECT +SELECT toStartOfMonth(date) AS month, formatReadableQuantity(sumIf(volume, symbol = 'AAPL')) AS apple_volume, formatReadableQuantity(sumIf(volume, symbol = 'MSFT')) AS microsoft_volume, @@ -118,7 +118,7 @@ ORDER BY month; 11. │ 2006-11-01 │ 494.37 million │ 1.24 billion │ 118.49 million │ 90177365500 │ 2.06 │ 12. 
│ 2006-12-01 │ 603.95 million │ 1.14 billion │ 91.77 million │ 80499584100 │ 2.28 │ └────────────┴────────────────┴──────────────────┴────────────────┴──────────────┴───────────────────────┘ -``` + ``` ## See also {#see-also} - [`sum`](/sql-reference/aggregate-functions/reference/sum) diff --git a/docs/guides/examples/aggregate_function_combinators/sumMap.md b/docs/guides/examples/aggregate_function_combinators/sumMap.md index fda6b895388..5d9b59acd80 100644 --- a/docs/guides/examples/aggregate_function_combinators/sumMap.md +++ b/docs/guides/examples/aggregate_function_combinators/sumMap.md @@ -11,13 +11,13 @@ sidebar_label: 'sumMap' ## Description {#description} The [`Map`](/sql-reference/aggregate-functions/combinators#-map) combinator can be applied to the [`sum`](/sql-reference/aggregate-functions/reference/sum) -function to calculate the sum of values in a Map according to each key, using the `sumMap` +function to calculate the sum of values in a Map according to each key, using the `sumMap` aggregate combinator function. ## Example usage {#example-usage} In this example, we'll create a table that stores status codes and their counts for different timeslots, -where each row contains a Map of status codes to their corresponding counts. We'll use +where each row contains a Map of status codes to their corresponding counts. We'll use `sumMap` to calculate the total count for each status code within each timeslot. ```sql title="Query" @@ -42,23 +42,23 @@ GROUP BY timeslot; The `sumMap` function will calculate the total count for each status code within each timeslot. For example: - In timeslot '2000-01-01 00:00:00': - - Status 'a': 15 - - Status 'b': 25 - - Status 'c': 35 + 45 = 80 - - Status 'd': 55 - - Status 'e': 65 + - Status 'a': 15 + - Status 'b': 25 + - Status 'c': 35 + 45 = 80 + - Status 'd': 55 + - Status 'e': 65 - In timeslot '2000-01-01 00:01:00': - - Status 'd': 75 - - Status 'e': 85 - - Status 'f': 95 + 105 = 200 - - Status 'g': 115 + 125 = 240 + - Status 'd': 75 + - Status 'e': 85 + - Status 'f': 95 + 105 = 200 + - Status 'g': 115 + 125 = 240 -```response title="Response" - ┌────────────timeslot─┬─sumMap(status)───────────────────────┐ -1. │ 2000-01-01 00:01:00 │ {'d':75,'e':85,'f':200,'g':240} │ -2. │ 2000-01-01 00:00:00 │ {'a':15,'b':25,'c':80,'d':55,'e':65} │ - └─────────────────────┴──────────────────────────────────────┘ -``` + ```response title="Response" + ┌────────────timeslot─┬─sumMap(status)───────────────────────┐ + 1. │ 2000-01-01 00:01:00 │ {'d':75,'e':85,'f':200,'g':240} │ + 2. │ 2000-01-01 00:00:00 │ {'a':15,'b':25,'c':80,'d':55,'e':65} │ + └─────────────────────┴──────────────────────────────────────┘ + ``` ## See also {#see-also} - [`sum`](/sql-reference/aggregate-functions/reference/sum) diff --git a/docs/guides/examples/aggregate_function_combinators/sumSimpleState.md b/docs/guides/examples/aggregate_function_combinators/sumSimpleState.md index 35b8759ea35..fa14a75af0c 100644 --- a/docs/guides/examples/aggregate_function_combinators/sumSimpleState.md +++ b/docs/guides/examples/aggregate_function_combinators/sumSimpleState.md @@ -11,18 +11,18 @@ sidebar_label: 'sumSimpleState' ## Description {#description} The [`SimpleState`](/sql-reference/aggregate-functions/combinators#-simplestate) combinator can be applied to the [`sum`](/sql-reference/aggregate-functions/reference/sum) -function to return the sum across all input values. It returns the result with +function to return the sum across all input values. 
It returns the result with type [`SimpleAggregateFunction`](/docs/sql-reference/data-types/simpleaggregatefunction). ## Example usage {#example-usage} ### Tracking upvotes and downvotes {#tracking-post-votes} -Let's look at a practical example using a table that tracks votes on posts. -For each post, we want to maintain running totals of upvotes, downvotes, and an +Let's look at a practical example using a table that tracks votes on posts. +For each post, we want to maintain running totals of upvotes, downvotes, and an overall score. Using the `SimpleAggregateFunction` type with sum is suited for -this use case as we only need to store the running totals, not the entire state -of the aggregation. As a result, it will be faster and will not require merging +this use case as we only need to store the running totals, not the entire state +of the aggregation. As a result, it will be faster and will not require merging of partial aggregate states. First, we create a table for the raw data: @@ -52,7 +52,7 @@ ORDER BY post_id; ``` We then create a materialized view with `SimpleAggregateFunction` type columns: - + ```sql CREATE MATERIALIZED VIEW mv_vote_processor TO vote_aggregates AS @@ -68,7 +68,7 @@ FROM raw_votes; ``` Insert sample data: - + ```sql INSERT INTO raw_votes VALUES (1, 'upvote'), diff --git a/docs/guides/examples/aggregate_function_combinators/uniqArray.md b/docs/guides/examples/aggregate_function_combinators/uniqArray.md index 07c651fb298..93c3ebed4d6 100644 --- a/docs/guides/examples/aggregate_function_combinators/uniqArray.md +++ b/docs/guides/examples/aggregate_function_combinators/uniqArray.md @@ -10,19 +10,19 @@ sidebar_label: 'uniqArray' ## Description {#description} -The [`Array`](/sql-reference/aggregate-functions/combinators#-array) combinator +The [`Array`](/sql-reference/aggregate-functions/combinators#-array) combinator can be applied to the [`uniq`](/sql-reference/aggregate-functions/reference/uniq) -function to calculate the approximate number of unique elements across all arrays, +function to calculate the approximate number of unique elements across all arrays, using the `uniqArray` aggregate combinator function. -The `uniqArray` function is useful when you need to count unique elements across -multiple arrays in a dataset. It's equivalent to using `uniq(arrayJoin())`, where +The `uniqArray` function is useful when you need to count unique elements across +multiple arrays in a dataset. It's equivalent to using `uniq(arrayJoin())`, where `arrayJoin` first flattens the arrays and then `uniq` counts the unique elements. ## Example usage {#example-usage} -In this example, we'll use a sample dataset of user interests across different -categories to demonstrate how `uniqArray` works. We'll compare it with +In this example, we'll use a sample dataset of user interests across different +categories to demonstrate how `uniqArray` works. We'll compare it with `uniq(arrayJoin())` to show the difference in counting unique elements. ```sql title="Query" @@ -37,22 +37,22 @@ INSERT INTO user_interests VALUES (2, ['gaming', 'sports', 'music']), (3, ['reading', 'cooking']); -SELECT +SELECT uniqArray(interests) AS unique_interests_total, uniq(arrayJoin(interests)) AS unique_interests_arrayJoin FROM user_interests; ``` -The `uniqArray` function counts unique elements across all arrays combined, similar to `uniq(arrayJoin())`. +The `uniqArray` function counts unique elements across all arrays combined, similar to `uniq(arrayJoin())`. 
In this example: - `uniqArray` returns 5 because there are 5 unique interests across all users: 'reading', 'gaming', 'music', 'sports', 'cooking' - `uniq(arrayJoin())` also returns 5, showing that both functions count unique elements across all arrays -```response title="Response" - ┌─unique_interests_total─┬─unique_interests_arrayJoin─┐ -1. │ 5 │ 5 │ - └────────────────────────┴────────────────────────────┘ -``` + ```response title="Response" + ┌─unique_interests_total─┬─unique_interests_arrayJoin─┐ + 1. │ 5 │ 5 │ + └────────────────────────┴────────────────────────────┘ + ``` ## See also {#see-also} - [`uniq`](/sql-reference/aggregate-functions/reference/uniq) diff --git a/docs/guides/examples/aggregate_function_combinators/uniqArrayIf.md b/docs/guides/examples/aggregate_function_combinators/uniqArrayIf.md index 31470be7e31..f4b7bff83b4 100644 --- a/docs/guides/examples/aggregate_function_combinators/uniqArrayIf.md +++ b/docs/guides/examples/aggregate_function_combinators/uniqArrayIf.md @@ -11,22 +11,22 @@ sidebar_label: 'uniqArrayIf' ## Description {#description} The [`Array`](/sql-reference/aggregate-functions/combinators#-array) and [`If`](/sql-reference/aggregate-functions/combinators#-if) combinators can be applied to the [`uniq`](/sql-reference/aggregate-functions/reference/uniq) -function to count the number of unique values in arrays for rows where the +function to count the number of unique values in arrays for rows where the condition is true, using the `uniqArrayIf` aggregate combinator function. :::note -`If` and -`Array` can be combined. However, `Array` must come first, then `If`. ::: -This is useful when you want to count unique elements in an array based on +This is useful when you want to count unique elements in an array based on specific conditions without having to use `arrayJoin`. ## Example usage {#example-usage} ### Count unique products viewed by segment type and engagement level {#count-unique-products} -In this example, we'll use a table with user shopping session data to count the -number of unique products viewed by users of a specific user segment and with +In this example, we'll use a table with user shopping session data to count the +number of unique products viewed by users of a specific user segment and with an engagement metric of time spent in the session. ```sql title="Query" @@ -47,7 +47,7 @@ INSERT INTO user_shopping_sessions VALUES ('2024-01-02', 'premium', ['smartphone_x', 'smartwatch_b', 'headphones_y'], 22); -- Count unique products viewed by segment type and engagement level -SELECT +SELECT session_date, -- Count unique products viewed in long sessions by new customers uniqArrayIf(viewed_products, user_segment = 'new_customer' AND session_duration_minutes > 10) AS new_customer_engaged_products, diff --git a/docs/guides/inserting-data.md b/docs/guides/inserting-data.md index 4e26d2c13ff..3ab7c29ea74 100644 --- a/docs/guides/inserting-data.md +++ b/docs/guides/inserting-data.md @@ -72,8 +72,8 @@ This means inserts remain resilient in the following cases: - 1. If the node receiving the data has issues, the insert query will time out (or give a more specific error) and not get an acknowledgment. - 2. If the data got written by the node but the acknowledgement can't be returned to the sender of the query because of network interruptions, the sender will either get a time-out or a network error. -From the client's perspective, (i) and (ii) can be hard to distinguish. However, in both cases, the unacknowledged insert can just be immediately retried. 
-As long as the retried insert query contains the same data in the same order, ClickHouse will automatically ignore the retried insert if the (unacknowledged) original insert succeeded. + From the client's perspective, (i) and (ii) can be hard to distinguish. However, in both cases, the unacknowledged insert can just be immediately retried. + As long as the retried insert query contains the same data in the same order, ClickHouse will automatically ignore the retried insert if the (unacknowledged) original insert succeeded. ### Insert to a MergeTree table or a distributed table {#insert-to-a-mergetree-table-or-a-distributed-table} @@ -110,7 +110,6 @@ Note that the data is not searchable by queries before being flushed to the data Full details on configuring asynchronous inserts can be found [here](/optimize/asynchronous-inserts#enabling-asynchronous-inserts), with a deep dive [here](https://clickhouse.com/blog/asynchronous-data-inserts-in-clickhouse). ::: - ### Use official ClickHouse clients {#use-official-clickhouse-clients} ClickHouse has clients in the most popular programming languages. @@ -148,11 +147,11 @@ See [HTTP Interface](/interfaces/http) for further details. For loading data from Postgres, users can use: - `PeerDB by ClickHouse`, an ETL tool specifically designed for PostgreSQL database replication. This is available in both: - - ClickHouse Cloud - available through our [new connector](/integrations/clickpipes/postgres) in ClickPipes, our managed ingestion service. - - Self-managed - via the [open-source project](https://github.com/PeerDB-io/peerdb). + - ClickHouse Cloud - available through our [new connector](/integrations/clickpipes/postgres) in ClickPipes, our managed ingestion service. + - Self-managed - via the [open-source project](https://github.com/PeerDB-io/peerdb). - The [PostgreSQL table engine](/integrations/postgresql#using-the-postgresql-table-engine) to read data directly as shown in previous examples. Typically appropriate if batch replication based on a known watermark, e.g., timestamp, is sufficient or if it's a one-off migration. This approach can scale to 10's millions of rows. Users looking to migrate larger datasets should consider multiple requests, each dealing with a chunk of the data. Staging tables can be used for each chunk prior to its partitions being moved to a final table. This allows failed requests to be retried. For further details on this bulk-loading strategy, see here. - Data can be exported from PostgreSQL in CSV format. This can then be inserted into ClickHouse from either local files or via object storage using table functions. -:::note Need help inserting large datasets? -If you need help inserting large datasets or encounter any errors when importing data into ClickHouse Cloud, please contact us at support@clickhouse.com and we can assist. -::: + :::note Need help inserting large datasets? + If you need help inserting large datasets or encounter any errors when importing data into ClickHouse Cloud, please contact us at support@clickhouse.com and we can assist. + ::: diff --git a/docs/guides/joining-tables.md b/docs/guides/joining-tables.md index 77ba8d118f6..2fbe1c99d40 100644 --- a/docs/guides/joining-tables.md +++ b/docs/guides/joining-tables.md @@ -18,110 +18,110 @@ ClickHouse has [full `JOIN` support](https://clickhouse.com/blog/clickhouse-full - Currently, ClickHouse does not reorder joins. Always ensure the smallest table is on the right-hand side of the Join. 
This will be held in memory for most join algorithms and will ensure the lowest memory overhead for the query. - If your query requires a direct join i.e. a `LEFT ANY JOIN` - as shown below, we recommend using [Dictionaries](/dictionary) where possible. - + - If performing inner joins, it is often more optimal to write these as sub-queries using the `IN` clause. Consider the following queries, which are functionally equivalent. Both find the number of `posts` that don't mention ClickHouse in the question but do in the `comments`. -```sql -SELECT count() -FROM stackoverflow.posts AS p -ANY INNER `JOIN` stackoverflow.comments AS c ON p.Id = c.PostId -WHERE (p.Title != '') AND (p.Title NOT ILIKE '%clickhouse%') AND (p.Body NOT ILIKE '%clickhouse%') AND (c.Text ILIKE '%clickhouse%') + ```sql + SELECT count() + FROM stackoverflow.posts AS p + ANY INNER `JOIN` stackoverflow.comments AS c ON p.Id = c.PostId + WHERE (p.Title != '') AND (p.Title NOT ILIKE '%clickhouse%') AND (p.Body NOT ILIKE '%clickhouse%') AND (c.Text ILIKE '%clickhouse%') -┌─count()─┐ -│ 86 │ -└─────────┘ + ┌─count()─┐ + │ 86 │ + └─────────┘ -1 row in set. Elapsed: 8.209 sec. Processed 150.20 million rows, 56.05 GB (18.30 million rows/s., 6.83 GB/s.) -Peak memory usage: 1.23 GiB. -``` + 1 row in set. Elapsed: 8.209 sec. Processed 150.20 million rows, 56.05 GB (18.30 million rows/s., 6.83 GB/s.) + Peak memory usage: 1.23 GiB. + ``` -Note we use an `ANY INNER JOIN` vs. just an `INNER` join as we don't want the cartesian product i.e. we want only one match for each post. + Note we use an `ANY INNER JOIN` vs. just an `INNER` join as we don't want the cartesian product i.e. we want only one match for each post. -This join can be rewritten using a subquery, improving performance significantly: + This join can be rewritten using a subquery, improving performance significantly: -```sql -SELECT count() -FROM stackoverflow.posts -WHERE (Title != '') AND (Title NOT ILIKE '%clickhouse%') AND (Body NOT ILIKE '%clickhouse%') AND (Id IN ( + ```sql + SELECT count() + FROM stackoverflow.posts + WHERE (Title != '') AND (Title NOT ILIKE '%clickhouse%') AND (Body NOT ILIKE '%clickhouse%') AND (Id IN ( SELECT PostId FROM stackoverflow.comments WHERE Text ILIKE '%clickhouse%' -)) -┌─count()─┐ -│ 86 │ -└─────────┘ + )) + ┌─count()─┐ + │ 86 │ + └─────────┘ -1 row in set. Elapsed: 2.284 sec. Processed 150.20 million rows, 16.61 GB (65.76 million rows/s., 7.27 GB/s.) -Peak memory usage: 323.52 MiB. -``` + 1 row in set. Elapsed: 2.284 sec. Processed 150.20 million rows, 16.61 GB (65.76 million rows/s., 7.27 GB/s.) + Peak memory usage: 323.52 MiB. + ``` -Although ClickHouse makes attempts to push down conditions to all join clauses and subqueries, we recommend users always manually apply conditions to all sub-clauses where possible - thus minimizing the size of the data to `JOIN`. Consider the following example below, where we want to compute the number of up-votes for Java-related posts since 2020. + Although ClickHouse makes attempts to push down conditions to all join clauses and subqueries, we recommend users always manually apply conditions to all sub-clauses where possible - thus minimizing the size of the data to `JOIN`. Consider the following example below, where we want to compute the number of up-votes for Java-related posts since 2020. 
-A naive query, with the larger table on the left side, completes in 56s: + A naive query, with the larger table on the left side, completes in 56s: -```sql -SELECT countIf(VoteTypeId = 2) AS upvotes -FROM stackoverflow.posts AS p -INNER JOIN stackoverflow.votes AS v ON p.Id = v.PostId -WHERE has(arrayFilter(t -> (t != ''), splitByChar('|', p.Tags)), 'java') AND (p.CreationDate >= '2020-01-01') + ```sql + SELECT countIf(VoteTypeId = 2) AS upvotes + FROM stackoverflow.posts AS p + INNER JOIN stackoverflow.votes AS v ON p.Id = v.PostId + WHERE has(arrayFilter(t -> (t != ''), splitByChar('|', p.Tags)), 'java') AND (p.CreationDate >= '2020-01-01') -┌─upvotes─┐ -│ 261915 │ -└─────────┘ + ┌─upvotes─┐ + │ 261915 │ + └─────────┘ -1 row in set. Elapsed: 56.642 sec. Processed 252.30 million rows, 1.62 GB (4.45 million rows/s., 28.60 MB/s.) -``` + 1 row in set. Elapsed: 56.642 sec. Processed 252.30 million rows, 1.62 GB (4.45 million rows/s., 28.60 MB/s.) + ``` -Re-ordering this join improves performance dramatically to 1.5s: + Re-ordering this join improves performance dramatically to 1.5s: -```sql -SELECT countIf(VoteTypeId = 2) AS upvotes -FROM stackoverflow.votes AS v -INNER JOIN stackoverflow.posts AS p ON v.PostId = p.Id -WHERE has(arrayFilter(t -> (t != ''), splitByChar('|', p.Tags)), 'java') AND (p.CreationDate >= '2020-01-01') + ```sql + SELECT countIf(VoteTypeId = 2) AS upvotes + FROM stackoverflow.votes AS v + INNER JOIN stackoverflow.posts AS p ON v.PostId = p.Id + WHERE has(arrayFilter(t -> (t != ''), splitByChar('|', p.Tags)), 'java') AND (p.CreationDate >= '2020-01-01') -┌─upvotes─┐ -│ 261915 │ -└─────────┘ + ┌─upvotes─┐ + │ 261915 │ + └─────────┘ -1 row in set. Elapsed: 1.519 sec. Processed 252.30 million rows, 1.62 GB (166.06 million rows/s., 1.07 GB/s.) -``` + 1 row in set. Elapsed: 1.519 sec. Processed 252.30 million rows, 1.62 GB (166.06 million rows/s., 1.07 GB/s.) + ``` -Adding a filter to the left side table improves performance even further to 0.5s. + Adding a filter to the left side table improves performance even further to 0.5s. -```sql -SELECT countIf(VoteTypeId = 2) AS upvotes -FROM stackoverflow.votes AS v -INNER JOIN stackoverflow.posts AS p ON v.PostId = p.Id -WHERE has(arrayFilter(t -> (t != ''), splitByChar('|', p.Tags)), 'java') AND (p.CreationDate >= '2020-01-01') AND (v.CreationDate >= '2020-01-01') + ```sql + SELECT countIf(VoteTypeId = 2) AS upvotes + FROM stackoverflow.votes AS v + INNER JOIN stackoverflow.posts AS p ON v.PostId = p.Id + WHERE has(arrayFilter(t -> (t != ''), splitByChar('|', p.Tags)), 'java') AND (p.CreationDate >= '2020-01-01') AND (v.CreationDate >= '2020-01-01') -┌─upvotes─┐ -│ 261915 │ -└─────────┘ + ┌─upvotes─┐ + │ 261915 │ + └─────────┘ -1 row in set. Elapsed: 0.597 sec. Processed 81.14 million rows, 1.31 GB (135.82 million rows/s., 2.19 GB/s.) -Peak memory usage: 249.42 MiB. -``` + 1 row in set. Elapsed: 0.597 sec. Processed 81.14 million rows, 1.31 GB (135.82 million rows/s., 2.19 GB/s.) + Peak memory usage: 249.42 MiB. + ``` -This query can be improved even more by moving the `INNER JOIN` to a subquery, as noted earlier, maintaining the filter on both the outer and inner queries. + This query can be improved even more by moving the `INNER JOIN` to a subquery, as noted earlier, maintaining the filter on both the outer and inner queries. 
-```sql -SELECT count() AS upvotes -FROM stackoverflow.votes -WHERE (VoteTypeId = 2) AND (PostId IN ( + ```sql + SELECT count() AS upvotes + FROM stackoverflow.votes + WHERE (VoteTypeId = 2) AND (PostId IN ( SELECT Id FROM stackoverflow.posts WHERE (CreationDate >= '2020-01-01') AND has(arrayFilter(t -> (t != ''), splitByChar('|', Tags)), 'java') -)) + )) -┌─upvotes─┐ -│ 261915 │ -└─────────┘ + ┌─upvotes─┐ + │ 261915 │ + └─────────┘ -1 row in set. Elapsed: 0.383 sec. Processed 99.64 million rows, 804.55 MB (259.85 million rows/s., 2.10 GB/s.) -Peak memory usage: 250.66 MiB. -``` + 1 row in set. Elapsed: 0.383 sec. Processed 99.64 million rows, 804.55 MB (259.85 million rows/s., 2.10 GB/s.) + Peak memory usage: 250.66 MiB. + ``` ## Choosing a JOIN algorithm {#choosing-a-join-algorithm} @@ -165,11 +165,11 @@ If your key optimization metric is performance and you are looking to execute th - **(4)** If the right table doesn't fit into memory, then it depends again. ClickHouse offers three non-memory bound join algorithms. All three temporarily spill data to disk. **Full sorting merge join** and **partial merge join** require prior sorting of the data. **Grace hash join** is building hash tables from the data instead. Based on the volume of data, the data types and the value distribution of the join key columns, there can be scenarios where building hash tables from the data is faster than sorting the data. And vice versa. -Partial merge join is optimized for minimizing memory usage when large tables are joined, at the expense of join speed which is quite slow. This is especially the case when the physical row order of the left table doesn't match the join key sorting order. + Partial merge join is optimized for minimizing memory usage when large tables are joined, at the expense of join speed which is quite slow. This is especially the case when the physical row order of the left table doesn't match the join key sorting order. -Grace hash join is the most flexible of the three non-memory-bound join algorithms and offers good control of memory usage vs. join speed with its [grace_hash_join_initial_buckets](https://github.com/ClickHouse/ClickHouse/blob/23.5/src/Core/Settings.h#L759) setting. Depending on the data volume the grace hash can be faster or slower than the partial merge algorithm, when the amount of [buckets](https://clickhouse.com/blog/clickhouse-fully-supports-joins-hash-joins-part2#description-2) is chosen such that the memory usage of both algorithms is approximately aligned. When the memory usage of grace hash join is configured to be approximately aligned with the memory usage of full sorting merge, then full sorting merge was always faster in our test runs. + Grace hash join is the most flexible of the three non-memory-bound join algorithms and offers good control of memory usage vs. join speed with its [grace_hash_join_initial_buckets](https://github.com/ClickHouse/ClickHouse/blob/23.5/src/Core/Settings.h#L759) setting. Depending on the data volume the grace hash can be faster or slower than the partial merge algorithm, when the amount of [buckets](https://clickhouse.com/blog/clickhouse-fully-supports-joins-hash-joins-part2#description-2) is chosen such that the memory usage of both algorithms is approximately aligned. When the memory usage of grace hash join is configured to be approximately aligned with the memory usage of full sorting merge, then full sorting merge was always faster in our test runs. 
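As a minimal sketch of how these settings are applied in practice (reusing the Stack Overflow tables from the examples above; the bucket count of `32` is an arbitrary illustrative value rather than a recommendation), the algorithm can be selected per session with the `join_algorithm` setting and the grace hash join tuned via `grace_hash_join_initial_buckets`:

```sql
-- Force the grace hash join and trade memory for join speed via the bucket count.
-- 32 buckets is only an illustrative value; benchmark against your own data.
SET join_algorithm = 'grace_hash';
SET grace_hash_join_initial_buckets = 32;

SELECT countIf(VoteTypeId = 2) AS upvotes
FROM stackoverflow.votes AS v
INNER JOIN stackoverflow.posts AS p ON v.PostId = p.Id
WHERE p.CreationDate >= '2020-01-01';
```

The same settings can also be scoped to a single statement with a trailing `SETTINGS` clause if you don't want them to affect the rest of the session.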
-Which one of the three non-memory-bound algorithms is the fastest depends on the volume of data, the data types, and the value distribution of the join key columns. It is always best to run some benchmarks with realistic data volumes of realistic data in order to determine which algorithm is the fastest. + Which one of the three non-memory-bound algorithms is the fastest depends on the volume of data, the data types, and the value distribution of the join key columns. It is always best to run some benchmarks with realistic data volumes of realistic data in order to determine which algorithm is the fastest. ## Optimizing for memory {#optimizing-for-memory} @@ -184,4 +184,4 @@ If you want to optimize a join for the lowest memory usage instead of the fastes - **(1)** If your table's physical row order matches the join key sort order, then the memory usage of the **full sorting merge join** is as low as it gets. With the additional benefit of good join speed because the sorting phase is [disabled](https://clickhouse.com/blog/clickhouse-fully-supports-joins-full-sort-partial-merge-part3#utilizing-physical-row-order). - **(2)** The **grace hash join** can be tuned for very low memory usage by [configuring](https://github.com/ClickHouse/ClickHouse/blob/23.5/src/Core/Settings.h#L759) a high number of [buckets](https://clickhouse.com/blog/clickhouse-fully-supports-joins-hash-joins-part2#description-2) at the expense of join speed. The **partial merge join** intentionally uses a low amount of main memory. The **full sorting merge join** with external sorting enabled generally uses more memory than the partial merge join (assuming the row order does not match the key sort order), with the benefit of significantly better join execution time. -For users needing more details on the above, we recommend the following [blog series](https://clickhouse.com/blog/clickhouse-fully-supports-joins-part1). + For users needing more details on the above, we recommend the following [blog series](https://clickhouse.com/blog/clickhouse-fully-supports-joins-part1). diff --git a/docs/guides/sre/configuring-ssl.md b/docs/guides/sre/configuring-ssl.md index 23cd2d2bfa0..bf2591286a2 100644 --- a/docs/guides/sre/configuring-ssl.md +++ b/docs/guides/sre/configuring-ssl.md @@ -32,12 +32,10 @@ This guide was written using Ubuntu 20.04 and ClickHouse installed on the follow |`chnode2` |192.168.1.222| |`chnode3` |192.168.1.223| - :::note View the [Quick Start](/getting-started/install/install.mdx) for more details on how to install ClickHouse. ::: - ## 2. Create SSL certificates {#2-create-ssl-certificates} :::note Using self-signed certificates are for demonstration purposes only and should not used in production. Certificate requests should be created to be signed by the organization and validated using the CA chain that will be configured in the settings. However, these steps can be used to configure and test settings, then can be replaced by the actual certificates that will be used. @@ -127,7 +125,6 @@ Recommended port is `9281` for ClickHouse Keeper. However, the port is configura For a full explanation of all options, visit https://clickhouse.com/docs/operations/clickhouse-keeper/ ::: - 1. Add the following inside the `` tag in ClickHouse server `config.xml` :::note @@ -239,7 +236,7 @@ For a full explanation of all options, visit https://clickhouse.com/docs/operati ## 5. 
Configure SSL-TLS interfaces on ClickHouse nodes {#5-configure-ssl-tls-interfaces-on-clickhouse-nodes} The settings below are configured in the ClickHouse server `config.xml` -1. Set the display name for the deployment (optional): +1. Set the display name for the deployment (optional): ```xml clickhouse ``` @@ -301,8 +298,6 @@ The settings below are configured in the ClickHouse server `config.xml` For more information, visit https://clickhouse.com/docs/operations/server-configuration-parameters/settings/#server_configuration_parameters-openssl - - 7. Configure gRPC for SSL on every node: ```xml @@ -381,67 +376,67 @@ The settings below are configured in the ClickHouse server `config.xml` |9444 | ClickHouse Keeper Raft port | 3. Verify ClickHouse Keeper health -The typical [4 letter word (4lW)](/guides/sre/keeper/index.md#four-letter-word-commands) commands will not work using `echo` without TLS, here is how to use the commands with `openssl`. - - Start an interactive session with `openssl` - - ```bash - openssl s_client -connect chnode1.marsnet.local:9281 - ``` - ```response - CONNECTED(00000003) - depth=0 CN = chnode1 - verify error:num=20:unable to get local issuer certificate - verify return:1 - depth=0 CN = chnode1 - verify error:num=21:unable to verify the first certificate - verify return:1 - --- - Certificate chain - 0 s:CN = chnode1 + The typical [4 letter word (4lW)](/guides/sre/keeper/index.md#four-letter-word-commands) commands will not work using `echo` without TLS, here is how to use the commands with `openssl`. + - Start an interactive session with `openssl` + + ```bash + openssl s_client -connect chnode1.marsnet.local:9281 + ``` + ```response + CONNECTED(00000003) + depth=0 CN = chnode1 + verify error:num=20:unable to get local issuer certificate + verify return:1 + depth=0 CN = chnode1 + verify error:num=21:unable to verify the first certificate + verify return:1 + --- + Certificate chain + 0 s:CN = chnode1 i:CN = marsnet.local CA - --- - Server certificate - -----BEGIN CERTIFICATE----- - MIICtDCCAZwCFD321grxU3G5pf6hjitf2u7vkusYMA0GCSqGSIb3DQEBCwUAMBsx - ... - ``` - - - Send the 4LW commands in the openssl session - - ```bash - mntr - ``` - ```response - --- - Post-Handshake New Session Ticket arrived: - SSL-Session: - Protocol : TLSv1.3 - ... - read R BLOCK - zk_version v22.7.3.5-stable-e140b8b5f3a5b660b6b576747063fd040f583cf3 - zk_avg_latency 0 - # highlight-next-line - zk_max_latency 4087 - zk_min_latency 0 - zk_packets_received 4565774 - zk_packets_sent 4565773 - zk_num_alive_connections 2 - zk_outstanding_requests 0 - # highlight-next-line - zk_server_state leader - zk_znode_count 1087 - zk_watch_count 26 - zk_ephemerals_count 12 - zk_approximate_data_size 426062 - zk_key_arena_size 258048 - zk_latest_snapshot_size 0 - zk_open_file_descriptor_count 187 - zk_max_file_descriptor_count 18446744073709551615 - # highlight-next-line - zk_followers 2 - zk_synced_followers 1 - closed - ``` + --- + Server certificate + -----BEGIN CERTIFICATE----- + MIICtDCCAZwCFD321grxU3G5pf6hjitf2u7vkusYMA0GCSqGSIb3DQEBCwUAMBsx + ... + ``` + + - Send the 4LW commands in the openssl session + + ```bash + mntr + ``` + ```response + --- + Post-Handshake New Session Ticket arrived: + SSL-Session: + Protocol : TLSv1.3 + ... 
+ read R BLOCK + zk_version v22.7.3.5-stable-e140b8b5f3a5b660b6b576747063fd040f583cf3 + zk_avg_latency 0 + # highlight-next-line + zk_max_latency 4087 + zk_min_latency 0 + zk_packets_received 4565774 + zk_packets_sent 4565773 + zk_num_alive_connections 2 + zk_outstanding_requests 0 + # highlight-next-line + zk_server_state leader + zk_znode_count 1087 + zk_watch_count 26 + zk_ephemerals_count 12 + zk_approximate_data_size 426062 + zk_key_arena_size 258048 + zk_latest_snapshot_size 0 + zk_open_file_descriptor_count 187 + zk_max_file_descriptor_count 18446744073709551615 + # highlight-next-line + zk_followers 2 + zk_synced_followers 1 + closed + ``` 4. Start the ClickHouse client using `--secure` flag and SSL port: ```bash diff --git a/docs/guides/sre/keeper/index.md b/docs/guides/sre/keeper/index.md index 940e1f2b9bb..e7763c0ba61 100644 --- a/docs/guides/sre/keeper/index.md +++ b/docs/guides/sre/keeper/index.md @@ -100,21 +100,20 @@ The main parameters for each `` are: - `port` — Port where this server listens for connections. - `can_become_leader` — Set to `false` to set up the server as a `learner`. If omitted, the value is `true`. -:::note -In the case of a change in the topology of your ClickHouse Keeper cluster (e.g., replacing a server), please make sure to keep the mapping of `server_id` to `hostname` consistent and avoid shuffling or reusing an existing `server_id` for different servers (e.g., it can happen if your rely on automation scripts to deploy ClickHouse Keeper) - -If the host of a Keeper instance can change, we recommend to define and use a hostname instead of raw IP addresses. Changing hostname is equal to removing and adding the server back which in some cases can be impossible to do (e.g. not enough Keeper instances for quorum). -::: + :::note + In the case of a change in the topology of your ClickHouse Keeper cluster (e.g., replacing a server), please make sure to keep the mapping of `server_id` to `hostname` consistent and avoid shuffling or reusing an existing `server_id` for different servers (e.g., it can happen if you rely on automation scripts to deploy ClickHouse Keeper). -:::note -`async_replication` is disabled by default to avoid breaking backwards compatibility. If you have all your Keeper instances in a cluster running a version supporting `async_replication` (v23.9+), we recommend enabling it because it can improve performance without any downsides. -::: + If the host of a Keeper instance can change, we recommend defining and using a hostname instead of raw IP addresses. Changing the hostname is equivalent to removing and adding the server back, which in some cases can be impossible to do (e.g. not enough Keeper instances for quorum). + ::: + :::note + `async_replication` is disabled by default to avoid breaking backwards compatibility. If you have all your Keeper instances in a cluster running a version supporting `async_replication` (v23.9+), we recommend enabling it because it can improve performance without any downsides. + ::: -Examples of configuration for quorum with three nodes can be found in [integration tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/integration) with `test_keeper_` prefix. Example configuration for server #1: + Examples of configuration for quorum with three nodes can be found in [integration tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/integration) with `test_keeper_` prefix.
Example configuration for server #1: -```xml - + ```xml + 2181 1 /var/lib/clickhouse/coordination/log @@ -143,8 +142,8 @@ Examples of configuration for quorum with three nodes can be found in [integrati 9234 - -``` + + ``` ### How to run {#how-to-run} @@ -176,238 +175,237 @@ Bellow is the detailed 4lw commands: - `ruok`: Tests if server is running in a non-error state. The server will respond with `imok` if it is running. Otherwise, it will not respond at all. A response of `imok` does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state with respect to quorum and client connection information. -```response -imok -``` + ```response + imok + ``` - `mntr`: Outputs a list of variables that could be used for monitoring the health of the cluster. -```response -zk_version v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 -zk_avg_latency 0 -zk_max_latency 0 -zk_min_latency 0 -zk_packets_received 68 -zk_packets_sent 68 -zk_num_alive_connections 1 -zk_outstanding_requests 0 -zk_server_state leader -zk_znode_count 4 -zk_watch_count 1 -zk_ephemerals_count 0 -zk_approximate_data_size 723 -zk_open_file_descriptor_count 310 -zk_max_file_descriptor_count 10240 -zk_followers 0 -zk_synced_followers 0 -``` + ```response + zk_version v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 + zk_avg_latency 0 + zk_max_latency 0 + zk_min_latency 0 + zk_packets_received 68 + zk_packets_sent 68 + zk_num_alive_connections 1 + zk_outstanding_requests 0 + zk_server_state leader + zk_znode_count 4 + zk_watch_count 1 + zk_ephemerals_count 0 + zk_approximate_data_size 723 + zk_open_file_descriptor_count 310 + zk_max_file_descriptor_count 10240 + zk_followers 0 + zk_synced_followers 0 + ``` - `srvr`: Lists full details for the server. -```response -ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 -Latency min/avg/max: 0/0/0 -Received: 2 -Sent : 2 -Connections: 1 -Outstanding: 0 -Zxid: 34 -Mode: leader -Node count: 4 -``` + ```response + ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 + Latency min/avg/max: 0/0/0 + Received: 2 + Sent : 2 + Connections: 1 + Outstanding: 0 + Zxid: 34 + Mode: leader + Node count: 4 + ``` - `stat`: Lists brief details for the server and connected clients. -```response -ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 -Clients: - 192.168.1.1:52852(recved=0,sent=0) - 192.168.1.1:52042(recved=24,sent=48) -Latency min/avg/max: 0/0/0 -Received: 4 -Sent : 4 -Connections: 1 -Outstanding: 0 -Zxid: 36 -Mode: leader -Node count: 4 -``` + ```response + ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 + Clients: + 192.168.1.1:52852(recved=0,sent=0) + 192.168.1.1:52042(recved=24,sent=48) + Latency min/avg/max: 0/0/0 + Received: 4 + Sent : 4 + Connections: 1 + Outstanding: 0 + Zxid: 36 + Mode: leader + Node count: 4 + ``` - `srst`: Reset server statistics. The command will affect the result of `srvr`, `mntr` and `stat`. -```response -Server stats reset. -``` + ```response + Server stats reset. + ``` - `conf`: Print details about serving configuration. 
-```response -server_id=1 -tcp_port=2181 -four_letter_word_white_list=* -log_storage_path=./coordination/logs -snapshot_storage_path=./coordination/snapshots -max_requests_batch_size=100 -session_timeout_ms=30000 -operation_timeout_ms=10000 -dead_session_check_period_ms=500 -heart_beat_interval_ms=500 -election_timeout_lower_bound_ms=1000 -election_timeout_upper_bound_ms=2000 -reserved_log_items=1000000000000000 -snapshot_distance=10000 -auto_forwarding=true -shutdown_timeout=5000 -startup_timeout=240000 -raft_logs_level=information -snapshots_to_keep=3 -rotate_log_storage_interval=100000 -stale_log_gap=10000 -fresh_log_gap=200 -max_requests_batch_size=100 -quorum_reads=false -force_sync=false -compress_logs=true -compress_snapshots_with_zstd_format=true -configuration_change_tries_count=20 -``` + ```response + server_id=1 + tcp_port=2181 + four_letter_word_white_list=* + log_storage_path=./coordination/logs + snapshot_storage_path=./coordination/snapshots + max_requests_batch_size=100 + session_timeout_ms=30000 + operation_timeout_ms=10000 + dead_session_check_period_ms=500 + heart_beat_interval_ms=500 + election_timeout_lower_bound_ms=1000 + election_timeout_upper_bound_ms=2000 + reserved_log_items=1000000000000000 + snapshot_distance=10000 + auto_forwarding=true + shutdown_timeout=5000 + startup_timeout=240000 + raft_logs_level=information + snapshots_to_keep=3 + rotate_log_storage_interval=100000 + stale_log_gap=10000 + fresh_log_gap=200 + max_requests_batch_size=100 + quorum_reads=false + force_sync=false + compress_logs=true + compress_snapshots_with_zstd_format=true + configuration_change_tries_count=20 + ``` - `cons`: List full connection/session details for all clients connected to this server. Includes information on numbers of packets received/sent, session id, operation latencies, last operation performed, etc... -```response - 192.168.1.1:52163(recved=0,sent=0,sid=0xffffffffffffffff,lop=NA,est=1636454787393,to=30000,lzxid=0xffffffffffffffff,lresp=0,llat=0,minlat=0,avglat=0,maxlat=0) - 192.168.1.1:52042(recved=9,sent=18,sid=0x0000000000000001,lop=List,est=1636454739887,to=30000,lcxid=0x0000000000000005,lzxid=0x0000000000000005,lresp=1636454739892,llat=0,minlat=0,avglat=0,maxlat=0) -``` + ```response + 192.168.1.1:52163(recved=0,sent=0,sid=0xffffffffffffffff,lop=NA,est=1636454787393,to=30000,lzxid=0xffffffffffffffff,lresp=0,llat=0,minlat=0,avglat=0,maxlat=0) + 192.168.1.1:52042(recved=9,sent=18,sid=0x0000000000000001,lop=List,est=1636454739887,to=30000,lcxid=0x0000000000000005,lzxid=0x0000000000000005,lresp=1636454739892,llat=0,minlat=0,avglat=0,maxlat=0) + ``` - `crst`: Reset connection/session statistics for all connections. -```response -Connection stats reset. -``` + ```response + Connection stats reset. 
+ ``` - `envi`: Print details about serving environment -```response -Environment: -clickhouse.keeper.version=v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 -host.name=ZBMAC-C02D4054M.local -os.name=Darwin -os.arch=x86_64 -os.version=19.6.0 -cpu.count=12 -user.name=root -user.home=/Users/JackyWoo/ -user.dir=/Users/JackyWoo/project/jd/clickhouse/cmake-build-debug/programs/ -user.tmp=/var/folders/b4/smbq5mfj7578f2jzwn602tt40000gn/T/ -``` - + ```response + Environment: + clickhouse.keeper.version=v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 + host.name=ZBMAC-C02D4054M.local + os.name=Darwin + os.arch=x86_64 + os.version=19.6.0 + cpu.count=12 + user.name=root + user.home=/Users/JackyWoo/ + user.dir=/Users/JackyWoo/project/jd/clickhouse/cmake-build-debug/programs/ + user.tmp=/var/folders/b4/smbq5mfj7578f2jzwn602tt40000gn/T/ + ``` - `dirs`: Shows the total size of snapshot and log files in bytes -```response -snapshot_dir_size: 0 -log_dir_size: 3875 -``` + ```response + snapshot_dir_size: 0 + log_dir_size: 3875 + ``` - `isro`: Tests if server is running in read-only mode. The server will respond with `ro` if in read-only mode or `rw` if not in read-only mode. -```response -rw -``` + ```response + rw + ``` - `wchs`: Lists brief information on watches for the server. -```response -1 connections watching 1 paths -Total watches:1 -``` + ```response + 1 connections watching 1 paths + Total watches:1 + ``` - `wchc`: Lists detailed information on watches for the server, by session. This outputs a list of sessions (connections) with associated watches (paths). Note, depending on the number of watches this operation may be expensive (impact server performance), use it carefully. -```response -0x0000000000000001 + ```response + 0x0000000000000001 /clickhouse/task_queue/ddl -``` + ``` - `wchp`: Lists detailed information on watches for the server, by path. This outputs a list of paths (znodes) with associated sessions. Note, depending on the number of watches this operation may be expensive (i.e., impact server performance), use it carefully. -```response -/clickhouse/task_queue/ddl + ```response + /clickhouse/task_queue/ddl 0x0000000000000001 -``` + ``` - `dump`: Lists the outstanding sessions and ephemeral nodes. This only works on the leader. -```response -Sessions dump (2): -0x0000000000000001 -0x0000000000000002 -Sessions with Ephemerals (1): -0x0000000000000001 - /clickhouse/task_queue/ddl -``` + ```response + Sessions dump (2): + 0x0000000000000001 + 0x0000000000000002 + Sessions with Ephemerals (1): + 0x0000000000000001 + /clickhouse/task_queue/ddl + ``` - `csnp`: Schedule a snapshot creation task. Return the last committed log index of the scheduled snapshot if success or `Failed to schedule snapshot creation task.` if failed. Note that `lgif` command can help you determine whether the snapshot is done. -```response -100 -``` + ```response + 100 + ``` - `lgif`: Keeper log information. `first_log_idx` : my first log index in log store; `first_log_term` : my first log term; `last_log_idx` : my last log index in log store; `last_log_term` : my last log term; `last_committed_log_idx` : my last committed log index in state machine; `leader_committed_log_idx` : leader's committed log index from my perspective; `target_committed_log_idx` : target log index should be committed to; `last_snapshot_idx` : the largest committed log index in last snapshot. 
-```response -first_log_idx 1 -first_log_term 1 -last_log_idx 101 -last_log_term 1 -last_committed_log_idx 100 -leader_committed_log_idx 101 -target_committed_log_idx 101 -last_snapshot_idx 50 -``` + ```response + first_log_idx 1 + first_log_term 1 + last_log_idx 101 + last_log_term 1 + last_committed_log_idx 100 + leader_committed_log_idx 101 + target_committed_log_idx 101 + last_snapshot_idx 50 + ``` - `rqld`: Request to become new leader. Return `Sent leadership request to leader.` if request sent or `Failed to send leadership request to leader.` if request not sent. Note that if node is already leader the outcome is same as the request is sent. -```response -Sent leadership request to leader. -``` + ```response + Sent leadership request to leader. + ``` - `ftfl`: Lists all feature flags and whether they are enabled for the Keeper instance. -```response -filtered_list 1 -multi_read 1 -check_not_exists 0 -``` + ```response + filtered_list 1 + multi_read 1 + check_not_exists 0 + ``` - `ydld`: Request to yield leadership and become follower. If the server receiving the request is leader, it will pause write operations first, wait until the successor (current leader can never be successor) finishes the catch-up of the latest log, and then resign. The successor will be chosen automatically. Return `Sent yield leadership request to leader.` if request sent or `Failed to send yield leadership request to leader.` if request not sent. Note that if node is already follower the outcome is same as the request is sent. -```response -Sent yield leadership request to leader. -``` + ```response + Sent yield leadership request to leader. + ``` - `pfev`: Returns the values for all collected events. For each event it returns event name, event value, and event's description. -```response -FileOpen 62 Number of files opened. -Seek 4 Number of times the 'lseek' function was called. -ReadBufferFromFileDescriptorRead 126 Number of reads (read/pread) from a file descriptor. Does not include sockets. -ReadBufferFromFileDescriptorReadFailed 0 Number of times the read (read/pread) from a file descriptor have failed. -ReadBufferFromFileDescriptorReadBytes 178846 Number of bytes read from file descriptors. If the file is compressed, this will show the compressed data size. -WriteBufferFromFileDescriptorWrite 7 Number of writes (write/pwrite) to a file descriptor. Does not include sockets. -WriteBufferFromFileDescriptorWriteFailed 0 Number of times the write (write/pwrite) to a file descriptor have failed. -WriteBufferFromFileDescriptorWriteBytes 153 Number of bytes written to file descriptors. If the file is compressed, this will show compressed data size. -FileSync 2 Number of times the F_FULLFSYNC/fsync/fdatasync function was called for files. -DirectorySync 0 Number of times the F_FULLFSYNC/fsync/fdatasync function was called for directories. -FileSyncElapsedMicroseconds 12756 Total time spent waiting for F_FULLFSYNC/fsync/fdatasync syscall for files. -DirectorySyncElapsedMicroseconds 0 Total time spent waiting for F_FULLFSYNC/fsync/fdatasync syscall for directories. -ReadCompressedBytes 0 Number of bytes (the number of bytes before decompression) read from compressed sources (files, network). -CompressedReadBufferBlocks 0 Number of compressed blocks (the blocks of data that are compressed independent of each other) read from compressed sources (files, network). -CompressedReadBufferBytes 0 Number of uncompressed bytes (the number of bytes after decompression) read from compressed sources (files, network). 
-AIOWrite 0 Number of writes with Linux or FreeBSD AIO interface -AIOWriteBytes 0 Number of bytes written with Linux or FreeBSD AIO interface -... -``` + ```response + FileOpen 62 Number of files opened. + Seek 4 Number of times the 'lseek' function was called. + ReadBufferFromFileDescriptorRead 126 Number of reads (read/pread) from a file descriptor. Does not include sockets. + ReadBufferFromFileDescriptorReadFailed 0 Number of times the read (read/pread) from a file descriptor have failed. + ReadBufferFromFileDescriptorReadBytes 178846 Number of bytes read from file descriptors. If the file is compressed, this will show the compressed data size. + WriteBufferFromFileDescriptorWrite 7 Number of writes (write/pwrite) to a file descriptor. Does not include sockets. + WriteBufferFromFileDescriptorWriteFailed 0 Number of times the write (write/pwrite) to a file descriptor have failed. + WriteBufferFromFileDescriptorWriteBytes 153 Number of bytes written to file descriptors. If the file is compressed, this will show compressed data size. + FileSync 2 Number of times the F_FULLFSYNC/fsync/fdatasync function was called for files. + DirectorySync 0 Number of times the F_FULLFSYNC/fsync/fdatasync function was called for directories. + FileSyncElapsedMicroseconds 12756 Total time spent waiting for F_FULLFSYNC/fsync/fdatasync syscall for files. + DirectorySyncElapsedMicroseconds 0 Total time spent waiting for F_FULLFSYNC/fsync/fdatasync syscall for directories. + ReadCompressedBytes 0 Number of bytes (the number of bytes before decompression) read from compressed sources (files, network). + CompressedReadBufferBlocks 0 Number of compressed blocks (the blocks of data that are compressed independent of each other) read from compressed sources (files, network). + CompressedReadBufferBytes 0 Number of uncompressed bytes (the number of bytes after decompression) read from compressed sources (files, network). + AIOWrite 0 Number of writes with Linux or FreeBSD AIO interface + AIOWriteBytes 0 Number of bytes written with Linux or FreeBSD AIO interface + ... + ``` ### HTTP control {#http-control} @@ -459,7 +457,7 @@ The following features are available: | `remove_recursive` | Support for `RemoveRecursive` request, which removes the node along with its subtree | `1` | :::note -Some of the feature flags are enabled by default from version 25.7. +Some of the feature flags are enabled by default from version 25.7. The recommended way of upgrading Keeper to 25.7+ is to first upgrade to version 24.9+. ::: @@ -473,23 +471,22 @@ Seamless migration from ZooKeeper to ClickHouse Keeper is not possible. You have 3. Run `clickhouse-keeper-converter` on a leader, for example: -```bash -clickhouse-keeper-converter --zookeeper-logs-dir /var/lib/zookeeper/version-2 --zookeeper-snapshots-dir /var/lib/zookeeper/version-2 --output-dir /path/to/clickhouse/keeper/snapshots -``` + ```bash + clickhouse-keeper-converter --zookeeper-logs-dir /var/lib/zookeeper/version-2 --zookeeper-snapshots-dir /var/lib/zookeeper/version-2 --output-dir /path/to/clickhouse/keeper/snapshots + ``` 4. Copy snapshot to ClickHouse server nodes with a configured `keeper` or start ClickHouse Keeper instead of ZooKeeper. The snapshot must persist on all nodes, otherwise, empty nodes can be faster and one of them can become a leader. -:::note -`keeper-converter` tool is not available from the Keeper standalone binary. 
-If you have ClickHouse installed, you can use the binary directly: + :::note + `keeper-converter` tool is not available from the Keeper standalone binary. + If you have ClickHouse installed, you can use the binary directly: -```bash -clickhouse keeper-converter ... -``` - -Otherwise, you can [download the binary](/getting-started/quick-start/oss#download-the-binary) and run the tool as described above without installing ClickHouse. -::: + ```bash + clickhouse keeper-converter ... + ``` + Otherwise, you can [download the binary](/getting-started/quick-start/oss#download-the-binary) and run the tool as described above without installing ClickHouse. + ::: ### Recovering after losing quorum {#recovering-after-losing-quorum} @@ -507,7 +504,7 @@ Important things to note before continuing: - Make sure that the failed nodes cannot connect to the cluster again. - Do not start any of the new nodes until it's specified in the steps. -After making sure that the above things are true, you need to do following: + After making sure that the above things are true, you need to do following: 1. Pick a single Keeper node to be your new leader. Be aware that the data of that node will be used for the entire cluster, so we recommend using a node with the most up-to-date state. 2. Before doing anything else, make a backup of the `log_storage_path` and `snapshot_storage_path` folders of the picked node. 3. Reconfigure the cluster on all of the nodes you want to use. @@ -525,10 +522,10 @@ Supported types of disks are: - s3 - local -Following is an example of disk definitions contained inside a config. + Following is an example of disk definitions contained inside a config. -```xml - + ```xml + @@ -559,28 +556,28 @@ Following is an example of disk definitions contained inside a config. - -``` + + ``` -To use a disk for logs `keeper_server.log_storage_disk` config should be set to the name of disk. -To use a disk for snapshots `keeper_server.snapshot_storage_disk` config should be set to the name of disk. -Additionally, different disks can be used for the latest logs or snapshots by using `keeper_server.latest_log_storage_disk` and `keeper_server.latest_snapshot_storage_disk` respectively. -In that case, Keeper will automatically move files to correct disks when new logs or snapshots are created. -To use a disk for state file, `keeper_server.state_storage_disk` config should be set to the name of disk. + To use a disk for logs `keeper_server.log_storage_disk` config should be set to the name of disk. + To use a disk for snapshots `keeper_server.snapshot_storage_disk` config should be set to the name of disk. + Additionally, different disks can be used for the latest logs or snapshots by using `keeper_server.latest_log_storage_disk` and `keeper_server.latest_snapshot_storage_disk` respectively. + In that case, Keeper will automatically move files to correct disks when new logs or snapshots are created. + To use a disk for state file, `keeper_server.state_storage_disk` config should be set to the name of disk. -Moving files between disks is safe and there is no risk of losing data if Keeper stops in the middle of transfer. -Until the file is completely moved to the new disk, it's not deleted from the old one. + Moving files between disks is safe and there is no risk of losing data if Keeper stops in the middle of transfer. + Until the file is completely moved to the new disk, it's not deleted from the old one. 
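As a minimal sketch of the disk settings described above (the disk names `log_local`, `snapshot_local`, and `state_local` are placeholders and must match disks declared in your storage configuration), the relevant `keeper_server` entries could look like this:

```xml
<clickhouse>
    <keeper_server>
        <!-- Placeholder disk names: each must refer to a disk defined in the storage configuration -->
        <log_storage_disk>log_local</log_storage_disk>
        <snapshot_storage_disk>snapshot_local</snapshot_storage_disk>
        <state_storage_disk>state_local</state_storage_disk>
    </keeper_server>
</clickhouse>
```

The more complete example below additionally layers `latest_log_storage_disk` and `latest_snapshot_storage_disk` on top of this setup.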
-Keeper with `keeper_server.coordination_settings.force_sync` set to `true` (`true` by default) cannot satisfy some guarantees for all types of disks. -Right now, only disks of type `local` support persistent sync. -If `force_sync` is used, `log_storage_disk` should be a `local` disk if `latest_log_storage_disk` is not used. -If `latest_log_storage_disk` is used, it should always be a `local` disk. -If `force_sync` is disabled, disks of all types can be used in any setup. + Keeper with `keeper_server.coordination_settings.force_sync` set to `true` (`true` by default) cannot satisfy some guarantees for all types of disks. + Right now, only disks of type `local` support persistent sync. + If `force_sync` is used, `log_storage_disk` should be a `local` disk if `latest_log_storage_disk` is not used. + If `latest_log_storage_disk` is used, it should always be a `local` disk. + If `force_sync` is disabled, disks of all types can be used in any setup. -A possible storage setup for a Keeper instance could look like following: + A possible storage setup for a Keeper instance could look like following: -```xml - + ```xml + log_s3_plain log_local @@ -588,11 +585,11 @@ A possible storage setup for a Keeper instance could look like following: snapshot_s3_plain snapshot_local - -``` + + ``` -This instance will store all but the latest logs on disk `log_s3_plain`, while the latest log will be on the `log_local` disk. -Same logic applies for snapshots, all but the latest snapshots will be stored on `snapshot_s3_plain`, while the latest snapshot will be on the `snapshot_local` disk. + This instance will store all but the latest logs on disk `log_s3_plain`, while the latest log will be on the `log_local` disk. + Same logic applies for snapshots, all but the latest snapshots will be stored on `snapshot_s3_plain`, while the latest snapshot will be on the `snapshot_local` disk. ### Changing disk setup {#changing-disk-setup} @@ -633,13 +630,12 @@ The limit is controlled with these two configs: - `latest_logs_cache_size_threshold` - total size of latest logs stored in cache - `commit_logs_cache_size_threshold` - total size of subsequent logs that need to be committed next -If the default values are too big, you can reduce the memory usage by reducing these two configs. - -:::note -You can use `pfev` command to check amount of logs read from each cache and from a file. -You can also use metrics from Prometheus endpoint to track the current size of both caches. -::: + If the default values are too big, you can reduce the memory usage by reducing these two configs. + :::note + You can use `pfev` command to check amount of logs read from each cache and from a file. + You can also use metrics from Prometheus endpoint to track the current size of both caches. + ::: ## Prometheus {#prometheus} @@ -653,10 +649,10 @@ Settings: - `events` – Flag that sets to expose metrics from the [system.events](/operations/system-tables/events) table. - `asynchronous_metrics` – Flag that sets to expose current metrics values from the [system.asynchronous_metrics](/operations/system-tables/asynchronous_metrics) table. 
-**Example** + **Example** -``` xml - + ``` xml + 0.0.0.0 8123 9000 @@ -669,15 +665,15 @@ Settings: true - -``` + + ``` -Check (replace `127.0.0.1` with the IP addr or hostname of your ClickHouse server): -```bash -curl 127.0.0.1:9363/metrics -``` + Check (replace `127.0.0.1` with the IP addr or hostname of your ClickHouse server): + ```bash + curl 127.0.0.1:9363/metrics + ``` -Please also see the ClickHouse Cloud [Prometheus integration](/integrations/prometheus). + Please also see the ClickHouse Cloud [Prometheus integration](/integrations/prometheus). ## ClickHouse Keeper user guide {#clickhouse-keeper-user-guide} @@ -739,8 +735,7 @@ This guide provides simple and minimal settings to configure ClickHouse Keeper w |hostname |hostname, IP or FQDN of each server in the keeper cluster|`chnode1.domain.com`| |port|port to listen on for interserver keeper connections|9234| - -4. Enable the Zookeeper component. It will use the ClickHouse Keeper engine: +4. Enable the Zookeeper component. It will use the ClickHouse Keeper engine: ```xml @@ -788,7 +783,6 @@ This guide provides simple and minimal settings to configure ClickHouse Keeper w └────────────┴───────┴───────┴───────┴─────────────────────┴─────────────────────┴─────────┴──────────┴──────────┴────────────────┴────────────┴─────────────┴───────┴─────────────┘ ``` - ### 2. Configure a cluster in ClickHouse {#2--configure-a-cluster-in-clickhouse} 1. Let's configure a simple cluster with 2 shards and only one replica on 2 of the nodes. The third node will be used to achieve a quorum for the requirement in ClickHouse Keeper. Update the configuration on `chnode1` and `chnode2`. The following cluster defines 1 shard on each node for a total of 2 shards with no replication. In this example, some of the data will be on node and some will be on the other node: @@ -824,7 +818,6 @@ This guide provides simple and minimal settings to configure ClickHouse Keeper w |user|username that will be used to authenticate to the cluster instances|default| |password|password for the user define to allow connections to cluster instances|`ClickHouse123!`| - 2. Restart ClickHouse and verify the cluster was created: ```bash SHOW clusters; @@ -839,7 +832,7 @@ This guide provides simple and minimal settings to configure ClickHouse Keeper w ### 3. Create and test distributed table {#3-create-and-test-distributed-table} -1. Create a new database on the new cluster using ClickHouse client on `chnode1`. The `ON CLUSTER` clause automatically creates the database on both nodes. +1. Create a new database on the new cluster using ClickHouse client on `chnode1`. The `ON CLUSTER` clause automatically creates the database on both nodes. ```sql CREATE DATABASE db1 ON CLUSTER 'cluster_2S_1R'; ``` @@ -891,7 +884,7 @@ This guide provides simple and minimal settings to configure ClickHouse Keeper w ``` On `chnode2`: -6. +6. ```sql SELECT * FROM db1.table1 @@ -940,7 +933,6 @@ This guide provides simple and minimal settings to configure ClickHouse Keeper w This guide demonstrated how to set up a cluster using ClickHouse Keeper. With ClickHouse Keeper, you can configure clusters and define distributed tables that can be replicated across shards. - ## Configuring ClickHouse Keeper with unique paths {#configuring-clickhouse-keeper-with-unique-paths} @@ -993,211 +985,211 @@ Example config for cluster: ### Procedures to set up tables to use `{uuid}` {#procedures-to-set-up-tables-to-use-uuid} 1. 
Configure Macros on each server -example for server 1: -```xml + example for server 1: + ```xml 1 replica_1 -``` -:::note -Notice that we define macros for `shard` and `replica`, but that `{uuid}` is not defined here, it is built-in and there is no need to define. -::: + ``` + :::note + Notice that we define macros for `shard` and `replica`, but that `{uuid}` is not defined here, it is built-in and there is no need to define. + ::: 2. Create a Database -```sql -CREATE DATABASE db_uuid + ```sql + CREATE DATABASE db_uuid ON CLUSTER 'cluster_1S_2R' ENGINE Atomic; -``` + ``` -```response -CREATE DATABASE db_uuid ON CLUSTER cluster_1S_2R -ENGINE = Atomic + ```response + CREATE DATABASE db_uuid ON CLUSTER cluster_1S_2R + ENGINE = Atomic -Query id: 07fb7e65-beb4-4c30-b3ef-bd303e5c42b5 + Query id: 07fb7e65-beb4-4c30-b3ef-bd303e5c42b5 -┌─host──────────────────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ -│ chnode2.marsnet.local │ 9440 │ 0 │ │ 1 │ 0 │ -│ chnode1.marsnet.local │ 9440 │ 0 │ │ 0 │ 0 │ -└───────────────────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ -``` + ┌─host──────────────────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ + │ chnode2.marsnet.local │ 9440 │ 0 │ │ 1 │ 0 │ + │ chnode1.marsnet.local │ 9440 │ 0 │ │ 0 │ 0 │ + └───────────────────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ + ``` 3. Create a table on the cluster using the macros and `{uuid}` -```sql -CREATE TABLE db_uuid.uuid_table1 ON CLUSTER 'cluster_1S_2R' - ( + ```sql + CREATE TABLE db_uuid.uuid_table1 ON CLUSTER 'cluster_1S_2R' + ( id UInt64, column1 String - ) - ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/db_uuid/{uuid}', '{replica}' ) - ORDER BY (id); -``` + ) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/db_uuid/{uuid}', '{replica}' ) + ORDER BY (id); + ``` -```response -CREATE TABLE db_uuid.uuid_table1 ON CLUSTER cluster_1S_2R -( + ```response + CREATE TABLE db_uuid.uuid_table1 ON CLUSTER cluster_1S_2R + ( `id` UInt64, `column1` String -) -ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/db_uuid/{uuid}', '{replica}') -ORDER BY id + ) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/db_uuid/{uuid}', '{replica}') + ORDER BY id -Query id: 8f542664-4548-4a02-bd2a-6f2c973d0dc4 + Query id: 8f542664-4548-4a02-bd2a-6f2c973d0dc4 -┌─host──────────────────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ -│ chnode1.marsnet.local │ 9440 │ 0 │ │ 1 │ 0 │ -│ chnode2.marsnet.local │ 9440 │ 0 │ │ 0 │ 0 │ -└───────────────────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ -``` + ┌─host──────────────────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ + │ chnode1.marsnet.local │ 9440 │ 0 │ │ 1 │ 0 │ + │ chnode2.marsnet.local │ 9440 │ 0 │ │ 0 │ 0 │ + └───────────────────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ + ``` -4. Create a distributed table +4. 
Create a distributed table -```sql -CREATE TABLE db_uuid.dist_uuid_table1 ON CLUSTER 'cluster_1S_2R' - ( + ```sql + CREATE TABLE db_uuid.dist_uuid_table1 ON CLUSTER 'cluster_1S_2R' + ( id UInt64, column1 String - ) - ENGINE = Distributed('cluster_1S_2R', 'db_uuid', 'uuid_table1' ); -``` + ) + ENGINE = Distributed('cluster_1S_2R', 'db_uuid', 'uuid_table1' ); + ``` -```response -CREATE TABLE db_uuid.dist_uuid_table1 ON CLUSTER cluster_1S_2R -( + ```response + CREATE TABLE db_uuid.dist_uuid_table1 ON CLUSTER cluster_1S_2R + ( `id` UInt64, `column1` String -) -ENGINE = Distributed('cluster_1S_2R', 'db_uuid', 'uuid_table1') + ) + ENGINE = Distributed('cluster_1S_2R', 'db_uuid', 'uuid_table1') -Query id: 3bc7f339-ab74-4c7d-a752-1ffe54219c0e + Query id: 3bc7f339-ab74-4c7d-a752-1ffe54219c0e -┌─host──────────────────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ -│ chnode2.marsnet.local │ 9440 │ 0 │ │ 1 │ 0 │ -│ chnode1.marsnet.local │ 9440 │ 0 │ │ 0 │ 0 │ -└───────────────────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ -``` + ┌─host──────────────────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ + │ chnode2.marsnet.local │ 9440 │ 0 │ │ 1 │ 0 │ + │ chnode1.marsnet.local │ 9440 │ 0 │ │ 0 │ 0 │ + └───────────────────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ + ``` ### Testing {#testing} -1. Insert data into first node (e.g `chnode1`) -```sql -INSERT INTO db_uuid.uuid_table1 - ( id, column1) - VALUES - ( 1, 'abc'); -``` +1. Insert data into first node (e.g `chnode1`) + ```sql + INSERT INTO db_uuid.uuid_table1 + ( id, column1) + VALUES + ( 1, 'abc'); + ``` -```response -INSERT INTO db_uuid.uuid_table1 (id, column1) FORMAT Values + ```response + INSERT INTO db_uuid.uuid_table1 (id, column1) FORMAT Values -Query id: 0f178db7-50a6-48e2-9a1b-52ed14e6e0f9 + Query id: 0f178db7-50a6-48e2-9a1b-52ed14e6e0f9 -Ok. + Ok. -1 row in set. Elapsed: 0.033 sec. -``` + 1 row in set. Elapsed: 0.033 sec. + ``` 2. Insert data into second node (e.g., `chnode2`) -```sql -INSERT INTO db_uuid.uuid_table1 - ( id, column1) - VALUES - ( 2, 'def'); -``` + ```sql + INSERT INTO db_uuid.uuid_table1 + ( id, column1) + VALUES + ( 2, 'def'); + ``` -```response -INSERT INTO db_uuid.uuid_table1 (id, column1) FORMAT Values + ```response + INSERT INTO db_uuid.uuid_table1 (id, column1) FORMAT Values -Query id: edc6f999-3e7d-40a0-8a29-3137e97e3607 + Query id: edc6f999-3e7d-40a0-8a29-3137e97e3607 -Ok. + Ok. -1 row in set. Elapsed: 0.529 sec. -``` + 1 row in set. Elapsed: 0.529 sec. + ``` 3. View records using distributed table -```sql -SELECT * FROM db_uuid.dist_uuid_table1; -``` + ```sql + SELECT * FROM db_uuid.dist_uuid_table1; + ``` -```response -SELECT * -FROM db_uuid.dist_uuid_table1 + ```response + SELECT * + FROM db_uuid.dist_uuid_table1 -Query id: 6cbab449-9e7f-40fe-b8c2-62d46ba9f5c8 + Query id: 6cbab449-9e7f-40fe-b8c2-62d46ba9f5c8 -┌─id─┬─column1─┐ -│ 1 │ abc │ -└────┴─────────┘ -┌─id─┬─column1─┐ -│ 2 │ def │ -└────┴─────────┘ + ┌─id─┬─column1─┐ + │ 1 │ abc │ + └────┴─────────┘ + ┌─id─┬─column1─┐ + │ 2 │ def │ + └────┴─────────┘ -2 rows in set. Elapsed: 0.007 sec. -``` + 2 rows in set. Elapsed: 0.007 sec. + ``` ### Alternatives {#alternatives} The default replication path can be defined beforehand by macros and using also `{uuid}` 1. Set default for tables on each node -```xml -/clickhouse/tables/{shard}/db_uuid/{uuid} -{replica} -``` -:::tip -You can also define a macro `{database}` on each node if nodes are used for certain databases. 
-::: + ```xml + /clickhouse/tables/{shard}/db_uuid/{uuid} + {replica} + ``` + :::tip + You can also define a macro `{database}` on each node if nodes are used for certain databases. + ::: 2. Create table without explicit parameters: -```sql -CREATE TABLE db_uuid.uuid_table1 ON CLUSTER 'cluster_1S_2R' - ( + ```sql + CREATE TABLE db_uuid.uuid_table1 ON CLUSTER 'cluster_1S_2R' + ( id UInt64, column1 String - ) - ENGINE = ReplicatedMergeTree - ORDER BY (id); -``` + ) + ENGINE = ReplicatedMergeTree + ORDER BY (id); + ``` -```response -CREATE TABLE db_uuid.uuid_table1 ON CLUSTER cluster_1S_2R -( + ```response + CREATE TABLE db_uuid.uuid_table1 ON CLUSTER cluster_1S_2R + ( `id` UInt64, `column1` String -) -ENGINE = ReplicatedMergeTree -ORDER BY id + ) + ENGINE = ReplicatedMergeTree + ORDER BY id -Query id: ab68cda9-ae41-4d6d-8d3b-20d8255774ee + Query id: ab68cda9-ae41-4d6d-8d3b-20d8255774ee -┌─host──────────────────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ -│ chnode2.marsnet.local │ 9440 │ 0 │ │ 1 │ 0 │ -│ chnode1.marsnet.local │ 9440 │ 0 │ │ 0 │ 0 │ -└───────────────────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ + ┌─host──────────────────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ + │ chnode2.marsnet.local │ 9440 │ 0 │ │ 1 │ 0 │ + │ chnode1.marsnet.local │ 9440 │ 0 │ │ 0 │ 0 │ + └───────────────────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ -2 rows in set. Elapsed: 1.175 sec. -``` + 2 rows in set. Elapsed: 1.175 sec. + ``` 3. Verify it used the settings used in default config -```sql -SHOW CREATE TABLE db_uuid.uuid_table1; -``` + ```sql + SHOW CREATE TABLE db_uuid.uuid_table1; + ``` -```response -SHOW CREATE TABLE db_uuid.uuid_table1 + ```response + SHOW CREATE TABLE db_uuid.uuid_table1 -CREATE TABLE db_uuid.uuid_table1 -( + CREATE TABLE db_uuid.uuid_table1 + ( `id` UInt64, `column1` String -) -ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/db_uuid/{uuid}', '{replica}') -ORDER BY id + ) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/db_uuid/{uuid}', '{replica}') + ORDER BY id -1 row in set. Elapsed: 0.003 sec. -``` + 1 row in set. Elapsed: 0.003 sec. + ``` ### Troubleshooting {#troubleshooting} @@ -1268,38 +1260,38 @@ server.id2 = ... - Each server entry is delimited by a newline. - `server_type` is either `participant` or `learner` ([learner](https://github.com/eBay/NuRaft/blob/master/docs/readonly_member.md) does not participate in leader elections). - `server_priority` is a non-negative integer telling [which nodes should be prioritised on leader elections](https://github.com/eBay/NuRaft/blob/master/docs/leader_election_priority.md). - Priority of 0 means server will never be a leader. + Priority of 0 means server will never be a leader. 
-Example: + Example: -```sql -:) get /keeper/config -server.1=zoo1:9234;participant;1 -server.2=zoo2:9234;participant;1 -server.3=zoo3:9234;participant;1 -``` + ```sql + :) get /keeper/config + server.1=zoo1:9234;participant;1 + server.2=zoo2:9234;participant;1 + server.3=zoo3:9234;participant;1 + ``` -You can use `reconfig` command to add new servers, remove existing ones, and change existing servers' -priorities, here are examples (using `clickhouse-keeper-client`): + You can use `reconfig` command to add new servers, remove existing ones, and change existing servers' + priorities, here are examples (using `clickhouse-keeper-client`): -```bash -# Add two new servers -reconfig add "server.5=localhost:123,server.6=localhost:234;learner" -# Remove two other servers -reconfig remove "3,4" -# Change existing server priority to 8 -reconfig add "server.5=localhost:5123;participant;8" -``` + ```bash + # Add two new servers + reconfig add "server.5=localhost:123,server.6=localhost:234;learner" + # Remove two other servers + reconfig remove "3,4" + # Change existing server priority to 8 + reconfig add "server.5=localhost:5123;participant;8" + ``` -And here are examples for `kazoo`: + And here are examples for `kazoo`: -```python -# Add two new servers, remove two other servers -reconfig(joining="server.5=localhost:123,server.6=localhost:234;learner", leaving="3,4") + ```python + # Add two new servers, remove two other servers + reconfig(joining="server.5=localhost:123,server.6=localhost:234;learner", leaving="3,4") -# Change existing server priority to 8 -reconfig(joining="server.5=localhost:5123;participant;8", leaving=None) -``` + # Change existing server priority to 8 + reconfig(joining="server.5=localhost:5123;participant;8", leaving=None) + ``` Servers in `joining` should be in server format described above. Server entries should be delimited by commas. While adding new servers, you can omit `server_priority` (default value is 1) and `server_type` (default value @@ -1315,23 +1307,23 @@ There are some caveats in Keeper reconfiguration implementation: - Only incremental reconfiguration is supported. Requests with non-empty `new_members` are declined. - ClickHouse Keeper implementation relies on NuRaft API to change membership dynamically. NuRaft has a way to - add a single server or remove a single server, one at a time. This means each change to configuration - (each part of `joining`, each part of `leaving`) must be decided on separately. Thus there is no bulk - reconfiguration available as it would be misleading for end users. + ClickHouse Keeper implementation relies on NuRaft API to change membership dynamically. NuRaft has a way to + add a single server or remove a single server, one at a time. This means each change to configuration + (each part of `joining`, each part of `leaving`) must be decided on separately. Thus there is no bulk + reconfiguration available as it would be misleading for end users. - Changing server type (participant/learner) isn't possible either as it's not supported by NuRaft, and - the only way would be to remove and add server, which again would be misleading. + Changing server type (participant/learner) isn't possible either as it's not supported by NuRaft, and + the only way would be to remove and add server, which again would be misleading. - You cannot use the returned `znodestat` value. - The `from_version` field is not used. All requests with set `from_version` are declined. 
- This is due to the fact `/keeper/config` is a virtual node, which means it is not stored in - persistent storage, but rather generated on-the-fly with the specified node config for every request. - This decision was made as to not duplicate data as NuRaft already stores this config. + This is due to the fact `/keeper/config` is a virtual node, which means it is not stored in + persistent storage, but rather generated on-the-fly with the specified node config for every request. + This decision was made as to not duplicate data as NuRaft already stores this config. - Unlike ZooKeeper, there is no way to wait on cluster reconfiguration by submitting a `sync` command. - New config will be _eventually_ applied but with no time guarantees. + New config will be _eventually_ applied but with no time guarantees. - `reconfig` command may fail for various reasons. You can check cluster's state and see whether the update - was applied. + was applied. ## Converting a single-node keeper into a cluster {#converting-a-single-node-keeper-into-a-cluster} @@ -1345,7 +1337,7 @@ Sometimes it's necessary to extend experimental keeper node into a cluster. Here - Update the `clickhouse-server` configuration by adding new keeper node there and restart it to apply the changes. - Update the raft configuration of the node 1 and, optionally, restart it. -To get confident with the process, here's a [sandbox repository](https://github.com/ClickHouse/keeper-extend-cluster). + To get confident with the process, here's a [sandbox repository](https://github.com/ClickHouse/keeper-extend-cluster). ## Unsupported features {#unsupported-features} diff --git a/docs/guides/sre/network-ports.md b/docs/guides/sre/network-ports.md index ce773f72c85..7e28f510db3 100644 --- a/docs/guides/sre/network-ports.md +++ b/docs/guides/sre/network-ports.md @@ -30,4 +30,3 @@ Ports described as **default** mean that the port number is configured in `/etc/ |9281|Recommended Secure SSL ClickHouse Keeper port||✓| |9440|Native protocol SSL/TLS port|✓|✓| |42000|Graphite default port||✓| - diff --git a/docs/guides/sre/user-management/configuring-ldap.md b/docs/guides/sre/user-management/configuring-ldap.md index e2ca9b41230..4c655d24c86 100644 --- a/docs/guides/sre/user-management/configuring-ldap.md +++ b/docs/guides/sre/user-management/configuring-ldap.md @@ -109,7 +109,6 @@ ClickHouse can be configured to use LDAP to authenticate ClickHouse database use |search_filter|ldap search filter to identify groups to select for mapping users |`(&(objectClass=groupOfUniqueNames)(uniqueMember={bind_dn}))`| |attribute |which attribute name should value be returned from|cn| - 4. Restart your ClickHouse server to apply the settings. ## 2. Configure ClickHouse database roles and permissions {#2-configure-clickhouse-database-roles-and-permissions} @@ -144,7 +143,7 @@ The procedures in this section assumes that SQL Access Control and Account Manag Use the `ldapsearch` command in step 1 to view all of the users available in the directory and for all of the users the password is `password` ::: -2. Test that the user was mapped correctly to the `scientists_role` role and has admin permissions +2. Test that the user was mapped correctly to the `scientists_role` role and has admin permissions ```sql SHOW DATABASES ``` @@ -169,5 +168,3 @@ The procedures in this section assumes that SQL Access Control and Account Manag ## Summary {#summary} This article demonstrated the basics of configuring ClickHouse to authenticate to an LDAP server and also to map to a role. 
There are also options for configuring individual users in ClickHouse but having those users be authenticated by LDAP without configuring automated role mapping. The LDAP module can also be used to connect to Active Directory. - - diff --git a/docs/guides/sre/user-management/index.md b/docs/guides/sre/user-management/index.md index 8e344d246bb..3074b31016f 100644 --- a/docs/guides/sre/user-management/index.md +++ b/docs/guides/sre/user-management/index.md @@ -18,7 +18,7 @@ ClickHouse access entities: - [Settings Profile](#settings-profiles-management) - [Quota](#quotas-management) -You can configure access entities using: + You can configure access entities using: - SQL-driven workflow. @@ -26,17 +26,17 @@ You can configure access entities using: - Server [configuration files](/operations/configuration-files.md) `users.xml` and `config.xml`. -We recommend using SQL-driven workflow. Both of the configuration methods work simultaneously, so if you use the server configuration files for managing accounts and access rights, you can smoothly switch to SQL-driven workflow. + We recommend using SQL-driven workflow. Both of the configuration methods work simultaneously, so if you use the server configuration files for managing accounts and access rights, you can smoothly switch to SQL-driven workflow. -:::note -You can't manage the same access entity by both configuration methods simultaneously. -::: + :::note + You can't manage the same access entity by both configuration methods simultaneously. + ::: -:::note -If you are looking to manage ClickHouse Cloud console users, please refer to this [page](/cloud/security/cloud-access-management) -::: + :::note + If you are looking to manage ClickHouse Cloud console users, please refer to this [page](/cloud/security/cloud-access-management) + ::: -To see all users, roles, profiles, etc. and all their grants use [`SHOW ACCESS`](/sql-reference/statements/show#show-access) statement. + To see all users, roles, profiles, etc. and all their grants use [`SHOW ACCESS`](/sql-reference/statements/show#show-access) statement. ## Overview {#access-control-usage} @@ -44,9 +44,9 @@ By default, the ClickHouse server provides the `default` user account which is n If you just started using ClickHouse, consider the following scenario: -1. [Enable](#enabling-access-control) SQL-driven access control and account management for the `default` user. -2. Log in to the `default` user account and create all the required users. Don't forget to create an administrator account (`GRANT ALL ON *.* TO admin_user_account WITH GRANT OPTION`). -3. [Restrict permissions](/operations/settings/permissions-for-queries) for the `default` user and disable SQL-driven access control and account management for it. +1. [Enable](#enabling-access-control) SQL-driven access control and account management for the `default` user. +2. Log in to the `default` user account and create all the required users. Don't forget to create an administrator account (`GRANT ALL ON *.* TO admin_user_account WITH GRANT OPTION`). +3. [Restrict permissions](/operations/settings/permissions-for-queries) for the `default` user and disable SQL-driven access control and account management for it. ### Properties of current solution {#access-control-properties} @@ -65,9 +65,9 @@ A user account is an access entity that allows to authorize someone in ClickHous - Settings with their constraints applied by default at user login. - Assigned settings profiles. 
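Several of these properties can be configured together in a short set of statements. The following is a minimal sketch using hypothetical names (`analyst`, `reporting_role`); it assumes the role already exists:

```sql
-- Create a user with a password, a host restriction, and a constrained setting
CREATE USER analyst
    IDENTIFIED WITH sha256_password BY 'Password123!'
    HOST IP '10.0.0.0/8'
    SETTINGS max_memory_usage = 10000000000 READONLY;

-- Grant an existing role and make it the default role applied at login
GRANT reporting_role TO analyst;
SET DEFAULT ROLE reporting_role TO analyst;
```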
-Privileges can be granted to a user account by the [GRANT](/sql-reference/statements/grant.md) query or by assigning [roles](#role-management). To revoke privileges from a user, ClickHouse provides the [REVOKE](/sql-reference/statements/revoke.md) query. To list privileges for a user, use the [SHOW GRANTS](/sql-reference/statements/show#show-grants) statement. + Privileges can be granted to a user account by the [GRANT](/sql-reference/statements/grant.md) query or by assigning [roles](#role-management). To revoke privileges from a user, ClickHouse provides the [REVOKE](/sql-reference/statements/revoke.md) query. To list privileges for a user, use the [SHOW GRANTS](/sql-reference/statements/show#show-grants) statement. -Management queries: + Management queries: - [CREATE USER](/sql-reference/statements/create/user.md) - [ALTER USER](/sql-reference/statements/alter/user) @@ -79,10 +79,10 @@ Management queries: Settings can be configured differently: for a user account, in its granted roles and in settings profiles. At user login, if a setting is configured for different access entities, the value and constraints of this setting are applied as follows (from higher to lower priority): -1. User account settings. -2. The settings for the default roles of the user account. If a setting is configured in some roles, then order of the setting application is undefined. -3. The settings from settings profiles assigned to a user or to its default roles. If a setting is configured in some profiles, then order of setting application is undefined. -4. Settings applied to the entire server by default or from the [default profile](/operations/server-configuration-parameters/settings#default_profile). +1. User account settings. +2. The settings for the default roles of the user account. If a setting is configured in some roles, then order of the setting application is undefined. +3. The settings from settings profiles assigned to a user or to its default roles. If a setting is configured in some profiles, then order of setting application is undefined. +4. Settings applied to the entire server by default or from the [default profile](/operations/server-configuration-parameters/settings#default_profile). ### Role {#role-management} @@ -94,7 +94,7 @@ A role contains: - Settings and constraints - List of assigned roles -Management queries: + Management queries: - [CREATE ROLE](/sql-reference/statements/create/role) - [ALTER ROLE](/sql-reference/statements/alter/role) @@ -104,7 +104,7 @@ Management queries: - [SHOW CREATE ROLE](/sql-reference/statements/show#show-create-role) - [SHOW ROLES](/sql-reference/statements/show#show-roles) -Privileges can be granted to a role by the [GRANT](/sql-reference/statements/grant.md) query. To revoke privileges from a role ClickHouse provides the [REVOKE](/sql-reference/statements/revoke.md) query. + Privileges can be granted to a role by the [GRANT](/sql-reference/statements/grant.md) query. To revoke privileges from a role ClickHouse provides the [REVOKE](/sql-reference/statements/revoke.md) query. #### Row policy {#row-policy-management} @@ -159,7 +159,6 @@ Management queries: By default, SQL-driven access control and account management is disabled for all users. You need to configure at least one user in the `users.xml` configuration file and set the values of the [`access_management`](/operations/settings/settings-users.md#access_management-user-setting), `named_collection_control`, `show_named_collections`, and `show_named_collections_secrets` settings to 1. 
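As an illustration, a minimal `users.xml` fragment enabling all four settings for the `default` user could look like the following sketch (adjust the user name to match your configuration):

```xml
<clickhouse>
    <users>
        <default>
            <!-- allow this user to manage access entities through SQL -->
            <access_management>1</access_management>
            <named_collection_control>1</named_collection_control>
            <show_named_collections>1</show_named_collections>
            <show_named_collections_secrets>1</show_named_collections_secrets>
        </default>
    </users>
</clickhouse>
```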
- ## Defining SQL users and roles {#defining-sql-users-and-roles} :::tip @@ -170,7 +169,7 @@ This article shows the basics of defining SQL users and roles and applying those ### Enabling SQL user mode {#enabling-sql-user-mode} -1. Enable SQL user mode in the `users.xml` file under the `` user: +1. Enable SQL user mode in the `users.xml` file under the `` user: ```xml 1 1 @@ -207,336 +206,333 @@ This article is intended to provide you with a better understanding of how to de The `ALTER` statements are divided into several categories, some of which are hierarchical and some of which are not and must be explicitly defined. - **Example DB, table and user configuration** 1. With an admin user, create a sample user -```sql -CREATE USER my_user IDENTIFIED BY 'password'; -``` + ```sql + CREATE USER my_user IDENTIFIED BY 'password'; + ``` 2. Create sample database -```sql -CREATE DATABASE my_db; -``` + ```sql + CREATE DATABASE my_db; + ``` 3. Create a sample table -```sql -CREATE TABLE my_db.my_table (id UInt64, column1 String) ENGINE = MergeTree() ORDER BY id; -``` + ```sql + CREATE TABLE my_db.my_table (id UInt64, column1 String) ENGINE = MergeTree() ORDER BY id; + ``` 4. Create a sample admin user to grant/revoke privileges -```sql -CREATE USER my_alter_admin IDENTIFIED BY 'password'; -``` + ```sql + CREATE USER my_alter_admin IDENTIFIED BY 'password'; + ``` -:::note -To grant or revoke permissions, the admin user must have the `WITH GRANT OPTION` privilege. -For example: - ```sql - GRANT ALTER ON my_db.* WITH GRANT OPTION - ``` -To `GRANT` or `REVOKE` privileges, the user must have those privileges themselves first. -::: + :::note + To grant or revoke permissions, the admin user must have the `WITH GRANT OPTION` privilege. + For example: + ```sql + GRANT ALTER ON my_db.* WITH GRANT OPTION + ``` + To `GRANT` or `REVOKE` privileges, the user must have those privileges themselves first. 
+ ::: -**Granting or Revoking Privileges** - -The `ALTER` hierarchy: - -```response -├── ALTER (only for table and view)/ -│ ├── ALTER TABLE/ -│ │ ├── ALTER UPDATE -│ │ ├── ALTER DELETE -│ │ ├── ALTER COLUMN/ -│ │ │ ├── ALTER ADD COLUMN -│ │ │ ├── ALTER DROP COLUMN -│ │ │ ├── ALTER MODIFY COLUMN -│ │ │ ├── ALTER COMMENT COLUMN -│ │ │ ├── ALTER CLEAR COLUMN -│ │ │ └── ALTER RENAME COLUMN -│ │ ├── ALTER INDEX/ -│ │ │ ├── ALTER ORDER BY -│ │ │ ├── ALTER SAMPLE BY -│ │ │ ├── ALTER ADD INDEX -│ │ │ ├── ALTER DROP INDEX -│ │ │ ├── ALTER MATERIALIZE INDEX -│ │ │ └── ALTER CLEAR INDEX -│ │ ├── ALTER CONSTRAINT/ -│ │ │ ├── ALTER ADD CONSTRAINT -│ │ │ └── ALTER DROP CONSTRAINT -│ │ ├── ALTER TTL/ -│ │ │ └── ALTER MATERIALIZE TTL -│ │ ├── ALTER SETTINGS -│ │ ├── ALTER MOVE PARTITION -│ │ ├── ALTER FETCH PARTITION -│ │ └── ALTER FREEZE PARTITION -│ └── ALTER LIVE VIEW/ -│ ├── ALTER LIVE VIEW REFRESH -│ └── ALTER LIVE VIEW MODIFY QUERY -├── ALTER DATABASE -├── ALTER USER -├── ALTER ROLE -├── ALTER QUOTA -├── ALTER [ROW] POLICY -└── ALTER [SETTINGS] PROFILE -``` + **Granting or Revoking Privileges** + + The `ALTER` hierarchy: + + ```response + ├── ALTER (only for table and view)/ + │ ├── ALTER TABLE/ + │ │ ├── ALTER UPDATE + │ │ ├── ALTER DELETE + │ │ ├── ALTER COLUMN/ + │ │ │ ├── ALTER ADD COLUMN + │ │ │ ├── ALTER DROP COLUMN + │ │ │ ├── ALTER MODIFY COLUMN + │ │ │ ├── ALTER COMMENT COLUMN + │ │ │ ├── ALTER CLEAR COLUMN + │ │ │ └── ALTER RENAME COLUMN + │ │ ├── ALTER INDEX/ + │ │ │ ├── ALTER ORDER BY + │ │ │ ├── ALTER SAMPLE BY + │ │ │ ├── ALTER ADD INDEX + │ │ │ ├── ALTER DROP INDEX + │ │ │ ├── ALTER MATERIALIZE INDEX + │ │ │ └── ALTER CLEAR INDEX + │ │ ├── ALTER CONSTRAINT/ + │ │ │ ├── ALTER ADD CONSTRAINT + │ │ │ └── ALTER DROP CONSTRAINT + │ │ ├── ALTER TTL/ + │ │ │ └── ALTER MATERIALIZE TTL + │ │ ├── ALTER SETTINGS + │ │ ├── ALTER MOVE PARTITION + │ │ ├── ALTER FETCH PARTITION + │ │ └── ALTER FREEZE PARTITION + │ └── ALTER LIVE VIEW/ + │ ├── ALTER LIVE VIEW REFRESH + │ └── ALTER LIVE VIEW MODIFY QUERY + ├── ALTER DATABASE + ├── ALTER USER + ├── ALTER ROLE + ├── ALTER QUOTA + ├── ALTER [ROW] POLICY + └── ALTER [SETTINGS] PROFILE + ``` 1. Granting `ALTER` Privileges to a User or Role -Using an `GRANT ALTER on *.* TO my_user` will only affect top-level `ALTER TABLE` and `ALTER VIEW` , other `ALTER` statements must be individually granted or revoked. + Using an `GRANT ALTER on *.* TO my_user` will only affect top-level `ALTER TABLE` and `ALTER VIEW` , other `ALTER` statements must be individually granted or revoked. 
-for example, granting basic `ALTER` privilege: + for example, granting basic `ALTER` privilege: -```sql -GRANT ALTER ON my_db.my_table TO my_user; -``` + ```sql + GRANT ALTER ON my_db.my_table TO my_user; + ``` -Resulting set of privileges: + Resulting set of privileges: -```sql -SHOW GRANTS FOR my_user; -``` + ```sql + SHOW GRANTS FOR my_user; + ``` -```response -SHOW GRANTS FOR my_user + ```response + SHOW GRANTS FOR my_user -Query id: 706befbc-525e-4ec1-a1a2-ba2508cc09e3 + Query id: 706befbc-525e-4ec1-a1a2-ba2508cc09e3 -┌─GRANTS FOR my_user───────────────────────────────────────────┐ -│ GRANT ALTER TABLE, ALTER VIEW ON my_db.my_table TO my_user │ -└──────────────────────────────────────────────────────────────┘ -``` + ┌─GRANTS FOR my_user───────────────────────────────────────────┐ + │ GRANT ALTER TABLE, ALTER VIEW ON my_db.my_table TO my_user │ + └──────────────────────────────────────────────────────────────┘ + ``` -This will grant all permissions under `ALTER TABLE` and `ALTER VIEW` from the example above, however, it will not grant certain other `ALTER` permissions such as `ALTER ROW POLICY` (Refer back to the hierarchy and you will see that `ALTER ROW POLICY` is not a child of `ALTER TABLE` or `ALTER VIEW`). Those must be explicitly granted or revoked. + This will grant all permissions under `ALTER TABLE` and `ALTER VIEW` from the example above, however, it will not grant certain other `ALTER` permissions such as `ALTER ROW POLICY` (Refer back to the hierarchy and you will see that `ALTER ROW POLICY` is not a child of `ALTER TABLE` or `ALTER VIEW`). Those must be explicitly granted or revoked. -If only a subset of `ALTER` permissions is needed then each can be granted separately, if there are sub-privileges to that permission then those would be automatically granted also. + If only a subset of `ALTER` permissions is needed then each can be granted separately, if there are sub-privileges to that permission then those would be automatically granted also. -For example: + For example: -```sql -GRANT ALTER COLUMN ON my_db.my_table TO my_user; -``` + ```sql + GRANT ALTER COLUMN ON my_db.my_table TO my_user; + ``` -Grants would be set as: + Grants would be set as: -```sql -SHOW GRANTS FOR my_user; -``` + ```sql + SHOW GRANTS FOR my_user; + ``` -```response -SHOW GRANTS FOR my_user + ```response + SHOW GRANTS FOR my_user -Query id: 47b3d03f-46ac-4385-91ec-41119010e4e2 + Query id: 47b3d03f-46ac-4385-91ec-41119010e4e2 -┌─GRANTS FOR my_user────────────────────────────────┐ -│ GRANT ALTER COLUMN ON default.my_table TO my_user │ -└───────────────────────────────────────────────────┘ + ┌─GRANTS FOR my_user────────────────────────────────┐ + │ GRANT ALTER COLUMN ON default.my_table TO my_user │ + └───────────────────────────────────────────────────┘ -1 row in set. Elapsed: 0.004 sec. -``` + 1 row in set. Elapsed: 0.004 sec. + ``` -This also gives the following sub-privileges: + This also gives the following sub-privileges: -```sql -ALTER ADD COLUMN -ALTER DROP COLUMN -ALTER MODIFY COLUMN -ALTER COMMENT COLUMN -ALTER CLEAR COLUMN -ALTER RENAME COLUMN -``` + ```sql + ALTER ADD COLUMN + ALTER DROP COLUMN + ALTER MODIFY COLUMN + ALTER COMMENT COLUMN + ALTER CLEAR COLUMN + ALTER RENAME COLUMN + ``` 2. Revoking `ALTER` privileges from Users and Roles -The `REVOKE` statement works similarly to the `GRANT` statement. + The `REVOKE` statement works similarly to the `GRANT` statement. 
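For example, the table-level `ALTER` privilege granted earlier could be removed with:

```sql
REVOKE ALTER ON my_db.my_table FROM my_user;
```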
-If a user/role was granted a sub-privilege, you can either revoke that sub-privilege directly or revoke the higher-level privilege it inherits from. + If a user/role was granted a sub-privilege, you can either revoke that sub-privilege directly or revoke the higher-level privilege it inherits from. -For example, if the user was granted `ALTER ADD COLUMN` + For example, if the user was granted `ALTER ADD COLUMN` -```sql -GRANT ALTER ADD COLUMN ON my_db.my_table TO my_user; -``` + ```sql + GRANT ALTER ADD COLUMN ON my_db.my_table TO my_user; + ``` -```response -GRANT ALTER ADD COLUMN ON my_db.my_table TO my_user + ```response + GRANT ALTER ADD COLUMN ON my_db.my_table TO my_user -Query id: 61fe0fdc-1442-4cd6-b2f3-e8f2a853c739 + Query id: 61fe0fdc-1442-4cd6-b2f3-e8f2a853c739 -Ok. + Ok. -0 rows in set. Elapsed: 0.002 sec. -``` + 0 rows in set. Elapsed: 0.002 sec. + ``` -```sql -SHOW GRANTS FOR my_user; -``` + ```sql + SHOW GRANTS FOR my_user; + ``` -```response -SHOW GRANTS FOR my_user + ```response + SHOW GRANTS FOR my_user -Query id: 27791226-a18f-46c8-b2b4-a9e64baeb683 + Query id: 27791226-a18f-46c8-b2b4-a9e64baeb683 -┌─GRANTS FOR my_user──────────────────────────────────┐ -│ GRANT ALTER ADD COLUMN ON my_db.my_table TO my_user │ -└─────────────────────────────────────────────────────┘ -``` + ┌─GRANTS FOR my_user──────────────────────────────────┐ + │ GRANT ALTER ADD COLUMN ON my_db.my_table TO my_user │ + └─────────────────────────────────────────────────────┘ + ``` -A privilege can be revoked individually: + A privilege can be revoked individually: -```sql -REVOKE ALTER ADD COLUMN ON my_db.my_table FROM my_user; -``` + ```sql + REVOKE ALTER ADD COLUMN ON my_db.my_table FROM my_user; + ``` -Or can be revoked from any of the upper levels (revoke all of the COLUMN sub privileges): + Or can be revoked from any of the upper levels (revoke all of the COLUMN sub privileges): -```response -REVOKE ALTER COLUMN ON my_db.my_table FROM my_user; -``` + ```response + REVOKE ALTER COLUMN ON my_db.my_table FROM my_user; + ``` -```response -REVOKE ALTER COLUMN ON my_db.my_table FROM my_user + ```response + REVOKE ALTER COLUMN ON my_db.my_table FROM my_user -Query id: b882ba1b-90fb-45b9-b10f-3cda251e2ccc + Query id: b882ba1b-90fb-45b9-b10f-3cda251e2ccc -Ok. + Ok. -0 rows in set. Elapsed: 0.002 sec. -``` + 0 rows in set. Elapsed: 0.002 sec. + ``` -```sql -SHOW GRANTS FOR my_user; -``` + ```sql + SHOW GRANTS FOR my_user; + ``` -```response -SHOW GRANTS FOR my_user + ```response + SHOW GRANTS FOR my_user -Query id: e7d341de-de65-490b-852c-fa8bb8991174 + Query id: e7d341de-de65-490b-852c-fa8bb8991174 -Ok. + Ok. -0 rows in set. Elapsed: 0.003 sec. -``` + 0 rows in set. Elapsed: 0.003 sec. + ``` -**Additional** + **Additional** -The privileges must be granted by a user that not only has the `WITH GRANT OPTION` but also has the privileges themselves. + The privileges must be granted by a user that not only has the `WITH GRANT OPTION` but also has the privileges themselves. 1. To grant an admin user the privilege and also allow them to administer a set of privileges -Below is an example: + Below is an example: -```sql -GRANT SELECT, ALTER COLUMN ON my_db.my_table TO my_alter_admin WITH GRANT OPTION; -``` + ```sql + GRANT SELECT, ALTER COLUMN ON my_db.my_table TO my_alter_admin WITH GRANT OPTION; + ``` -Now the user can grant or revoke `ALTER COLUMN` and all sub-privileges. + Now the user can grant or revoke `ALTER COLUMN` and all sub-privileges. -**Testing** + **Testing** 1. 
Add the `SELECT` privilege -```sql - GRANT SELECT ON my_db.my_table TO my_user; -``` + ```sql + GRANT SELECT ON my_db.my_table TO my_user; + ``` 2. Add the add column privilege to the user -```sql -GRANT ADD COLUMN ON my_db.my_table TO my_user; -``` + ```sql + GRANT ADD COLUMN ON my_db.my_table TO my_user; + ``` 3. Log in with the restricted user -```bash -clickhouse-client --user my_user --password password --port 9000 --host -``` + ```bash + clickhouse-client --user my_user --password password --port 9000 --host + ``` 4. Test adding a column -```sql -ALTER TABLE my_db.my_table ADD COLUMN column2 String; -``` + ```sql + ALTER TABLE my_db.my_table ADD COLUMN column2 String; + ``` -```response -ALTER TABLE my_db.my_table + ```response + ALTER TABLE my_db.my_table ADD COLUMN `column2` String -Query id: d5d6bfa1-b80c-4d9f-8dcd-d13e7bd401a5 + Query id: d5d6bfa1-b80c-4d9f-8dcd-d13e7bd401a5 -Ok. + Ok. -0 rows in set. Elapsed: 0.010 sec. -``` + 0 rows in set. Elapsed: 0.010 sec. + ``` -```sql -DESCRIBE my_db.my_table; -``` + ```sql + DESCRIBE my_db.my_table; + ``` -```response -DESCRIBE TABLE my_db.my_table + ```response + DESCRIBE TABLE my_db.my_table -Query id: ab9cb2d0-5b1a-42e1-bc9c-c7ff351cb272 + Query id: ab9cb2d0-5b1a-42e1-bc9c-c7ff351cb272 -┌─name────┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐ -│ id │ UInt64 │ │ │ │ │ │ -│ column1 │ String │ │ │ │ │ │ -│ column2 │ String │ │ │ │ │ │ -└─────────┴────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ -``` + ┌─name────┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐ + │ id │ UInt64 │ │ │ │ │ │ + │ column1 │ String │ │ │ │ │ │ + │ column2 │ String │ │ │ │ │ │ + └─────────┴────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ + ``` 4. Test deleting a column -```sql -ALTER TABLE my_db.my_table DROP COLUMN column2; -``` + ```sql + ALTER TABLE my_db.my_table DROP COLUMN column2; + ``` -```response -ALTER TABLE my_db.my_table + ```response + ALTER TABLE my_db.my_table DROP COLUMN column2 -Query id: 50ad5f6b-f64b-4c96-8f5f-ace87cea6c47 - + Query id: 50ad5f6b-f64b-4c96-8f5f-ace87cea6c47 -0 rows in set. Elapsed: 0.004 sec. + 0 rows in set. Elapsed: 0.004 sec. -Received exception from server (version 22.5.1): -Code: 497. DB::Exception: Received from chnode1.marsnet.local:9440. DB::Exception: my_user: Not enough privileges. To execute this query it's necessary to have grant ALTER DROP COLUMN(column2) ON my_db.my_table. (ACCESS_DENIED) -``` + Received exception from server (version 22.5.1): + Code: 497. DB::Exception: Received from chnode1.marsnet.local:9440. DB::Exception: my_user: Not enough privileges. To execute this query it's necessary to have grant ALTER DROP COLUMN(column2) ON my_db.my_table. (ACCESS_DENIED) + ``` 5. Testing the alter admin by granting the permission -```sql -GRANT SELECT, ALTER COLUMN ON my_db.my_table TO my_alter_admin WITH GRANT OPTION; -``` + ```sql + GRANT SELECT, ALTER COLUMN ON my_db.my_table TO my_alter_admin WITH GRANT OPTION; + ``` 6. Log in with the alter admin user -```bash -clickhouse-client --user my_alter_admin --password password --port 9000 --host -``` + ```bash + clickhouse-client --user my_alter_admin --password password --port 9000 --host + ``` 7. 
Grant a sub-privilege -```sql -GRANT ALTER ADD COLUMN ON my_db.my_table TO my_user; -``` + ```sql + GRANT ALTER ADD COLUMN ON my_db.my_table TO my_user; + ``` -```response -GRANT ALTER ADD COLUMN ON my_db.my_table TO my_user + ```response + GRANT ALTER ADD COLUMN ON my_db.my_table TO my_user -Query id: 1c7622fa-9df1-4c54-9fc3-f984c716aeba + Query id: 1c7622fa-9df1-4c54-9fc3-f984c716aeba -Ok. -``` + Ok. + ``` 8. Test granting a privilege that the alter admin user does not have is not a sub privilege of the grants for the admin user. -```sql -GRANT ALTER UPDATE ON my_db.my_table TO my_user; -``` - -```response -GRANT ALTER UPDATE ON my_db.my_table TO my_user + ```sql + GRANT ALTER UPDATE ON my_db.my_table TO my_user; + ``` -Query id: 191690dc-55a6-4625-8fee-abc3d14a5545 + ```response + GRANT ALTER UPDATE ON my_db.my_table TO my_user + Query id: 191690dc-55a6-4625-8fee-abc3d14a5545 -0 rows in set. Elapsed: 0.004 sec. + 0 rows in set. Elapsed: 0.004 sec. -Received exception from server (version 22.5.1): -Code: 497. DB::Exception: Received from chnode1.marsnet.local:9440. DB::Exception: my_alter_admin: Not enough privileges. To execute this query it's necessary to have grant ALTER UPDATE ON my_db.my_table WITH GRANT OPTION. (ACCESS_DENIED) -``` + Received exception from server (version 22.5.1): + Code: 497. DB::Exception: Received from chnode1.marsnet.local:9440. DB::Exception: my_alter_admin: Not enough privileges. To execute this query it's necessary to have grant ALTER UPDATE ON my_db.my_table WITH GRANT OPTION. (ACCESS_DENIED) + ``` -**Summary** -The ALTER privileges are hierarchical for `ALTER` with tables and views but not for other `ALTER` statements. The permissions can be set in granular level or by grouping of permissions and also revoked similarly. The user granting or revoking must have `WITH GRANT OPTION` to set privileges on users, including the acting user themselves, and must have the privilege already. The acting user cannot revoke their own privileges if they do not have the grant option privilege themselves. + **Summary** + The ALTER privileges are hierarchical for `ALTER` with tables and views but not for other `ALTER` statements. The permissions can be set in granular level or by grouping of permissions and also revoked similarly. The user granting or revoking must have `WITH GRANT OPTION` to set privileges on users, including the acting user themselves, and must have the privilege already. The acting user cannot revoke their own privileges if they do not have the grant option privilege themselves. diff --git a/docs/guides/sre/user-management/ssl-user-auth.md b/docs/guides/sre/user-management/ssl-user-auth.md index c7135ff5261..1848e4bfc5c 100644 --- a/docs/guides/sre/user-management/ssl-user-auth.md +++ b/docs/guides/sre/user-management/ssl-user-auth.md @@ -29,7 +29,6 @@ If you use AWS NLB with the MySQL interface, you have to ask AWS support to enab This example uses self-signed certificates with a self-signed CA. For production environments, create a CSR and submit to your PKI team or certificate provider to obtain a proper certificate. ::: - 1. Generate a Certificate Signing Request (CSR) and key. The basic format is the following: ```bash openssl req -newkey rsa:2048 -nodes -subj "/CN=:" -keyout .key -out .csr @@ -42,7 +41,7 @@ This example uses self-signed certificates with a self-signed CA. For production The CN is arbitrary and any string can be used as an identifier for the certificate. It is used when creating the user in the following steps. ::: -2. 
Generate and sign the new user certificate that will be used for authentication. The basic format is the following: +2. Generate and sign the new user certificate that will be used for authentication. The basic format is the following: ```bash openssl x509 -req -in .csr -out .crt -CA .crt -CAkey .key -days 365 ``` @@ -89,7 +88,6 @@ For details on how to enable SQL users and set roles, refer to [Defining SQL Use ``` ::: - ## 3. Testing {#3-testing} 1. Copy the user certificate, user key and CA certificate to a remote node. @@ -114,7 +112,6 @@ For details on how to enable SQL users and set roles, refer to [Defining SQL Use Note that the password passed to clickhouse-client is ignored when a certificate is specified in the config. ::: - ## 4. Testing HTTP {#4-testing-http} 1. Copy the user certificate, user key and CA certificate to a remote node. @@ -138,7 +135,6 @@ For details on how to enable SQL users and set roles, refer to [Defining SQL Use Notice that no password was specified, the certificate is used in lieu of a password and is how ClickHouse will authenticate the user. ::: - ## Summary {#summary} This article showed the basics of creating and configuring a user for SSL certificate authentication. This method can be used with `clickhouse-client` or any clients which support the `https` interface and where HTTP headers can be set. The generated certificate and key should be kept private and with limited access since the certificate is used to authenticate and authorize the user for operations on the ClickHouse database. Treat the certificate and key as if they were passwords. diff --git a/docs/guides/troubleshooting.md b/docs/guides/troubleshooting.md index 65cd5ddfeaa..5efbb6406c0 100644 --- a/docs/guides/troubleshooting.md +++ b/docs/guides/troubleshooting.md @@ -14,9 +14,9 @@ The `apt-key` feature with the [Advanced package tool (APT) has been deprecated] 1. See if your `gpg` is installed: -```shell -sudo apt-get install gnupg -``` + ```shell + sudo apt-get install gnupg + ``` ### Cannot get deb packages from ClickHouse repository with apt-get {#cannot-get-deb-packages-from-clickhouse-repository-with-apt-get} @@ -104,33 +104,33 @@ If the server started successfully, you should see the strings: - ` Application: starting up.` — Server started. - ` Application: Ready for connections.` — Server is running and ready for connections. -If `clickhouse-server` start failed with a configuration error, you should see the `` string with an error description. For example: + If `clickhouse-server` start failed with a configuration error, you should see the `` string with an error description. For example: -```plaintext -2019.01.11 15:23:25.549505 [ 45 ] {} ExternalDictionaries: Failed reloading 'event2id' external dictionary: Poco::Exception. Code: 1000, e.code() = 111, e.displayText() = Connection refused, e.what() = Connection refused -``` + ```plaintext + 2019.01.11 15:23:25.549505 [ 45 ] {} ExternalDictionaries: Failed reloading 'event2id' external dictionary: Poco::Exception. Code: 1000, e.code() = 111, e.displayText() = Connection refused, e.what() = Connection refused + ``` -If you do not see an error at the end of the file, look through the entire file starting from the string: + If you do not see an error at the end of the file, look through the entire file starting from the string: -```plaintext - Application: starting up. -``` + ```plaintext + Application: starting up. 
+ ``` -If you try to start a second instance of `clickhouse-server` on the server, you see the following log: + If you try to start a second instance of `clickhouse-server` on the server, you see the following log: -```plaintext -2019.01.11 15:25:11.151730 [ 1 ] {} : Starting ClickHouse 19.1.0 with revision 54413 -2019.01.11 15:25:11.154578 [ 1 ] {} Application: starting up -2019.01.11 15:25:11.156361 [ 1 ] {} StatusFile: Status file ./status already exists - unclean restart. Contents: -PID: 8510 -Started at: 2019-01-11 15:24:23 -Revision: 54413 + ```plaintext + 2019.01.11 15:25:11.151730 [ 1 ] {} : Starting ClickHouse 19.1.0 with revision 54413 + 2019.01.11 15:25:11.154578 [ 1 ] {} Application: starting up + 2019.01.11 15:25:11.156361 [ 1 ] {} StatusFile: Status file ./status already exists - unclean restart. Contents: + PID: 8510 + Started at: 2019-01-11 15:24:23 + Revision: 54413 -2019.01.11 15:25:11.156673 [ 1 ] {} Application: DB::Exception: Cannot lock file ./status. Another server instance in same directory is already running. -2019.01.11 15:25:11.156682 [ 1 ] {} Application: shutting down -2019.01.11 15:25:11.156686 [ 1 ] {} Application: Uninitializing subsystem: Logging Subsystem -2019.01.11 15:25:11.156716 [ 2 ] {} BaseDaemon: Stop SignalListener thread -``` + 2019.01.11 15:25:11.156673 [ 1 ] {} Application: DB::Exception: Cannot lock file ./status. Another server instance in same directory is already running. + 2019.01.11 15:25:11.156682 [ 1 ] {} Application: shutting down + 2019.01.11 15:25:11.156686 [ 1 ] {} Application: Uninitializing subsystem: Logging Subsystem + 2019.01.11 15:25:11.156716 [ 2 ] {} BaseDaemon: Stop SignalListener thread + ``` #### See system.d logs {#see-systemd-logs} diff --git a/docs/integrations/data-ingestion/apache-spark/index.md b/docs/integrations/data-ingestion/apache-spark/index.md index 76f624ab537..de2916137b1 100644 --- a/docs/integrations/data-ingestion/apache-spark/index.md +++ b/docs/integrations/data-ingestion/apache-spark/index.md @@ -21,11 +21,8 @@ science, and machine learning on single-node machines or clusters. There are two main ways to connect Apache Spark and ClickHouse: 1. [Spark Connector](./apache-spark/spark-native-connector) - The Spark connector implements the `DataSourceV2` and has its own Catalog - management. As of today, this is the recommended way to integrate ClickHouse and Spark. + management. As of today, this is the recommended way to integrate ClickHouse and Spark. 2. [Spark JDBC](./apache-spark/spark-jdbc) - Integrate Spark and ClickHouse - using a [JDBC data source](https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html). - -
-
-Both solutions have been successfully tested and are fully compatible with various APIs, including Java, Scala, PySpark, and Spark SQL. + using a [JDBC data source](https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html). + Both solutions have been successfully tested and are fully compatible with various APIs, including Java, Scala, PySpark, and Spark SQL. diff --git a/docs/integrations/data-ingestion/apache-spark/spark-jdbc.md b/docs/integrations/data-ingestion/apache-spark/spark-jdbc.md index 746126aeacf..86e0bc4f1f7 100644 --- a/docs/integrations/data-ingestion/apache-spark/spark-jdbc.md +++ b/docs/integrations/data-ingestion/apache-spark/spark-jdbc.md @@ -22,143 +22,108 @@ use the [ClickHouse official JDBC connector](/integrations/language-clients/java - ```java public static void main(String[] args) { - // Initialize Spark session - SparkSession spark = SparkSession.builder().appName("example").master("local").getOrCreate(); - - String jdbcURL = "jdbc:ch://localhost:8123/default"; - String query = "select * from example_table where id > 2"; - - - //--------------------------------------------------------------------------------------------------- - // Load the table from ClickHouse using jdbc method - //--------------------------------------------------------------------------------------------------- - Properties jdbcProperties = new Properties(); - jdbcProperties.put("user", "default"); - jdbcProperties.put("password", "123456"); - - Dataset df1 = spark.read().jdbc(jdbcURL, String.format("(%s)", query), jdbcProperties); - - df1.show(); - - //--------------------------------------------------------------------------------------------------- - // Load the table from ClickHouse using load method - //--------------------------------------------------------------------------------------------------- - Dataset df2 = spark.read() - .format("jdbc") - .option("url", jdbcURL) - .option("user", "default") - .option("password", "123456") - .option("query", query) - .load(); - - - df2.show(); - - - // Stop the Spark session - spark.stop(); - } +// Initialize Spark session +SparkSession spark = SparkSession.builder().appName("example").master("local").getOrCreate(); +String jdbcURL = "jdbc:ch://localhost:8123/default"; +String query = "select * from example_table where id > 2"; +//--------------------------------------------------------------------------------------------------- +// Load the table from ClickHouse using jdbc method +//--------------------------------------------------------------------------------------------------- +Properties jdbcProperties = new Properties(); +jdbcProperties.put("user", "default"); +jdbcProperties.put("password", "123456"); +Dataset df1 = spark.read().jdbc(jdbcURL, String.format("(%s)", query), jdbcProperties); +df1.show(); +//--------------------------------------------------------------------------------------------------- +// Load the table from ClickHouse using load method +//--------------------------------------------------------------------------------------------------- +Dataset df2 = spark.read() +.format("jdbc") +.option("url", jdbcURL) +.option("user", "default") +.option("password", "123456") +.option("query", query) +.load(); +df2.show(); +// Stop the Spark session +spark.stop(); +} ``` - - ```java object ReadData extends App { - // Initialize Spark session - val spark: SparkSession = SparkSession.builder.appName("example").master("local").getOrCreate - - val jdbcURL = "jdbc:ch://localhost:8123/default" - val query: String = "select * 
from example_table where id > 2" - - - //--------------------------------------------------------------------------------------------------- - // Load the table from ClickHouse using jdbc method - //--------------------------------------------------------------------------------------------------- - val connectionProperties = new Properties() - connectionProperties.put("user", "default") - connectionProperties.put("password", "123456") - - val df1: Dataset[Row] = spark.read. - jdbc(jdbcURL, s"($query)", connectionProperties) - - df1.show() - //--------------------------------------------------------------------------------------------------- - // Load the table from ClickHouse using load method - //--------------------------------------------------------------------------------------------------- - val df2: Dataset[Row] = spark.read - .format("jdbc") - .option("url", jdbcURL) - .option("user", "default") - .option("password", "123456") - .option("query", query) - .load() - - df2.show() - - - - // Stop the Spark session// Stop the Spark session - spark.stop() - +// Initialize Spark session +val spark: SparkSession = SparkSession.builder.appName("example").master("local").getOrCreate +val jdbcURL = "jdbc:ch://localhost:8123/default" +val query: String = "select * from example_table where id > 2" +//--------------------------------------------------------------------------------------------------- +// Load the table from ClickHouse using jdbc method +//--------------------------------------------------------------------------------------------------- +val connectionProperties = new Properties() +connectionProperties.put("user", "default") +connectionProperties.put("password", "123456") +val df1: Dataset[Row] = spark.read. +jdbc(jdbcURL, s"($query)", connectionProperties) +df1.show() +//--------------------------------------------------------------------------------------------------- +// Load the table from ClickHouse using load method +//--------------------------------------------------------------------------------------------------- +val df2: Dataset[Row] = spark.read +.format("jdbc") +.option("url", jdbcURL) +.option("user", "default") +.option("password", "123456") +.option("query", query) +.load() +df2.show() +// Stop the Spark session// Stop the Spark session +spark.stop() } ``` - - ```python from pyspark.sql import SparkSession - jar_files = [ - "jars/clickhouse-jdbc-X.X.X-SNAPSHOT-all.jar" +"jars/clickhouse-jdbc-X.X.X-SNAPSHOT-all.jar" ] - # Initialize Spark session with JARs spark = SparkSession.builder \ - .appName("example") \ - .master("local") \ - .config("spark.jars", ",".join(jar_files)) \ - .getOrCreate() - +.appName("example") \ +.master("local") \ +.config("spark.jars", ",".join(jar_files)) \ +.getOrCreate() url = "jdbc:ch://localhost:8123/default" -user = "your_user" -password = "your_password" +user = "your_user" +password = "your_password" query = "select * from example_table where id > 2" driver = "com.clickhouse.jdbc.ClickHouseDriver" - df = (spark.read - .format('jdbc') - .option('driver', driver) - .option('url', url) - .option('user', user) - .option('password', password).option( - 'query', query).load()) - +.format('jdbc') +.option('driver', driver) +.option('url', url) +.option('user', user) +.option('password', password).option( +'query', query).load()) df.show() - ``` - - ```sql - CREATE TEMPORARY VIEW jdbcTable - USING org.apache.spark.sql.jdbc - OPTIONS ( - url "jdbc:ch://localhost:8123/default", - dbtable "schema.tablename", - user "username", - password 
"password", - driver "com.clickhouse.jdbc.ClickHouseDriver" - ); - - SELECT * FROM jdbcTable; +CREATE TEMPORARY VIEW jdbcTable +USING org.apache.spark.sql.jdbc +OPTIONS ( +url "jdbc:ch://localhost:8123/default", +dbtable "schema.tablename", +user "username", +password "password", +driver "com.clickhouse.jdbc.ClickHouseDriver" +); +SELECT * FROM jdbcTable; ``` - @@ -166,179 +131,137 @@ df.show() - ```java - public static void main(String[] args) { - // Initialize Spark session - SparkSession spark = SparkSession.builder().appName("example").master("local").getOrCreate(); - - // JDBC connection details - String jdbcUrl = "jdbc:ch://localhost:8123/default"; - Properties jdbcProperties = new Properties(); - jdbcProperties.put("user", "default"); - jdbcProperties.put("password", "123456"); - - // Create a sample DataFrame - StructType schema = new StructType(new StructField[]{ - DataTypes.createStructField("id", DataTypes.IntegerType, false), - DataTypes.createStructField("name", DataTypes.StringType, false) - }); - - List rows = new ArrayList(); - rows.add(RowFactory.create(1, "John")); - rows.add(RowFactory.create(2, "Doe")); - - - Dataset df = spark.createDataFrame(rows, schema); - - //--------------------------------------------------------------------------------------------------- - // Write the df to ClickHouse using the jdbc method - //--------------------------------------------------------------------------------------------------- - - df.write() - .mode(SaveMode.Append) - .jdbc(jdbcUrl, "example_table", jdbcProperties); - - //--------------------------------------------------------------------------------------------------- - // Write the df to ClickHouse using the save method - //--------------------------------------------------------------------------------------------------- - - df.write() - .format("jdbc") - .mode("append") - .option("url", jdbcUrl) - .option("dbtable", "example_table") - .option("user", "default") - .option("password", "123456") - .save(); - - - // Stop the Spark session - spark.stop(); - } +public static void main(String[] args) { +// Initialize Spark session +SparkSession spark = SparkSession.builder().appName("example").master("local").getOrCreate(); +// JDBC connection details +String jdbcUrl = "jdbc:ch://localhost:8123/default"; +Properties jdbcProperties = new Properties(); +jdbcProperties.put("user", "default"); +jdbcProperties.put("password", "123456"); +// Create a sample DataFrame +StructType schema = new StructType(new StructField[]{ +DataTypes.createStructField("id", DataTypes.IntegerType, false), +DataTypes.createStructField("name", DataTypes.StringType, false) +}); +List rows = new ArrayList(); +rows.add(RowFactory.create(1, "John")); +rows.add(RowFactory.create(2, "Doe")); +Dataset df = spark.createDataFrame(rows, schema); +//--------------------------------------------------------------------------------------------------- +// Write the df to ClickHouse using the jdbc method +//--------------------------------------------------------------------------------------------------- +df.write() +.mode(SaveMode.Append) +.jdbc(jdbcUrl, "example_table", jdbcProperties); +//--------------------------------------------------------------------------------------------------- +// Write the df to ClickHouse using the save method +//--------------------------------------------------------------------------------------------------- +df.write() +.format("jdbc") +.mode("append") +.option("url", jdbcUrl) +.option("dbtable", "example_table") +.option("user", 
"default") +.option("password", "123456") +.save(); +// Stop the Spark session +spark.stop(); +} ``` - - ```java object WriteData extends App { - - val spark: SparkSession = SparkSession.builder.appName("example").master("local").getOrCreate - - // JDBC connection details - val jdbcUrl: String = "jdbc:ch://localhost:8123/default" - val jdbcProperties: Properties = new Properties - jdbcProperties.put("user", "default") - jdbcProperties.put("password", "123456") - - // Create a sample DataFrame - - - val rows = Seq(Row(1, "John"), Row(2, "Doe")) - - val schema = List( - StructField("id", DataTypes.IntegerType, nullable = false), - StructField("name", StringType, nullable = true) - ) - - val df: DataFrame = spark.createDataFrame( - spark.sparkContext.parallelize(rows), - StructType(schema) - ) - - //---------------------------------------------------------------------------------------------------//--------------------------------------------------------------------------------------------------- - // Write the df to ClickHouse using the jdbc method - //---------------------------------------------------------------------------------------------------//--------------------------------------------------------------------------------------------------- - - df.write - .mode(SaveMode.Append) - .jdbc(jdbcUrl, "example_table", jdbcProperties) - - //---------------------------------------------------------------------------------------------------//--------------------------------------------------------------------------------------------------- - // Write the df to ClickHouse using the save method - //---------------------------------------------------------------------------------------------------//--------------------------------------------------------------------------------------------------- - - df.write - .format("jdbc") - .mode("append") - .option("url", jdbcUrl) - .option("dbtable", "example_table") - .option("user", "default") - .option("password", "123456") - .save() - - - // Stop the Spark session// Stop the Spark session - spark.stop() - +val spark: SparkSession = SparkSession.builder.appName("example").master("local").getOrCreate +// JDBC connection details +val jdbcUrl: String = "jdbc:ch://localhost:8123/default" +val jdbcProperties: Properties = new Properties +jdbcProperties.put("user", "default") +jdbcProperties.put("password", "123456") +// Create a sample DataFrame +val rows = Seq(Row(1, "John"), Row(2, "Doe")) +val schema = List( +StructField("id", DataTypes.IntegerType, nullable = false), +StructField("name", StringType, nullable = true) +) +val df: DataFrame = spark.createDataFrame( +spark.sparkContext.parallelize(rows), +StructType(schema) +) +//---------------------------------------------------------------------------------------------------//--------------------------------------------------------------------------------------------------- +// Write the df to ClickHouse using the jdbc method +//---------------------------------------------------------------------------------------------------//--------------------------------------------------------------------------------------------------- +df.write +.mode(SaveMode.Append) +.jdbc(jdbcUrl, "example_table", jdbcProperties) +//---------------------------------------------------------------------------------------------------//--------------------------------------------------------------------------------------------------- +// Write the df to ClickHouse using the save method 
+//---------------------------------------------------------------------------------------------------//--------------------------------------------------------------------------------------------------- +df.write +.format("jdbc") +.mode("append") +.option("url", jdbcUrl) +.option("dbtable", "example_table") +.option("user", "default") +.option("password", "123456") +.save() +// Stop the Spark session// Stop the Spark session +spark.stop() } ``` - - ```python from pyspark.sql import SparkSession from pyspark.sql import Row - jar_files = [ - "jars/clickhouse-jdbc-X.X.X-SNAPSHOT-all.jar" +"jars/clickhouse-jdbc-X.X.X-SNAPSHOT-all.jar" ] - # Initialize Spark session with JARs spark = SparkSession.builder \ - .appName("example") \ - .master("local") \ - .config("spark.jars", ",".join(jar_files)) \ - .getOrCreate() - +.appName("example") \ +.master("local") \ +.config("spark.jars", ",".join(jar_files)) \ +.getOrCreate() # Create DataFrame data = [Row(id=11, name="John"), Row(id=12, name="Doe")] df = spark.createDataFrame(data) - url = "jdbc:ch://localhost:8123/default" -user = "your_user" -password = "your_password" +user = "your_user" +password = "your_password" driver = "com.clickhouse.jdbc.ClickHouseDriver" - # Write DataFrame to ClickHouse df.write \ - .format("jdbc") \ - .option("driver", driver) \ - .option("url", url) \ - .option("user", user) \ - .option("password", password) \ - .option("dbtable", "example_table") \ - .mode("append") \ - .save() - - +.format("jdbc") \ +.option("driver", driver) \ +.option("url", url) \ +.option("user", user) \ +.option("password", password) \ +.option("dbtable", "example_table") \ +.mode("append") \ +.save() ``` - - ```sql - CREATE TEMPORARY VIEW jdbcTable - USING org.apache.spark.sql.jdbc - OPTIONS ( - url "jdbc:ch://localhost:8123/default", - dbtable "schema.tablename", - user "username", - password "password", - driver "com.clickhouse.jdbc.ClickHouseDriver" - ); - -- resultTable could be created with df.createTempView or with Spark SQL - INSERT INTO TABLE jdbcTable - SELECT * FROM resultTable; - +CREATE TEMPORARY VIEW jdbcTable +USING org.apache.spark.sql.jdbc +OPTIONS ( +url "jdbc:ch://localhost:8123/default", +dbtable "schema.tablename", +user "username", +password "password", +driver "com.clickhouse.jdbc.ClickHouseDriver" +); +-- resultTable could be created with df.createTempView or with Spark SQL +INSERT INTO TABLE jdbcTable +SELECT * FROM resultTable; ``` - - ## Parallelism {#parallelism} When using Spark JDBC, Spark reads the data using a single partition. To achieve higher concurrency, you must specify @@ -350,4 +273,4 @@ on [JDBC configurations](https://spark.apache.org/docs/latest/sql-data-sources-j ## JDBC limitations {#jdbc-limitations} * As of today, you can insert data using JDBC only into existing tables (currently there is no way to auto create the - table on DF insertion, as Spark does with other connectors). + table on DF insertion, as Spark does with other connectors). 
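To make the parallel read described in the Parallelism section above concrete, here is a minimal PySpark sketch. The table name, the numeric `id` partition column, and the bound values are illustrative assumptions; `partitionColumn`, `lowerBound`, `upperBound`, and `numPartitions` are the standard Spark JDBC options.

```python
from pyspark.sql import SparkSession

# Minimal sketch of a partitioned JDBC read; table, column, and bounds are illustrative.
spark = SparkSession.builder \
    .appName("example-parallel-read") \
    .master("local") \
    .config("spark.jars", "jars/clickhouse-jdbc-X.X.X-SNAPSHOT-all.jar") \
    .getOrCreate()

df = (spark.read
      .format("jdbc")
      .option("driver", "com.clickhouse.jdbc.ClickHouseDriver")
      .option("url", "jdbc:ch://localhost:8123/default")
      .option("user", "default")
      .option("password", "123456")
      .option("dbtable", "example_table")
      # Partitioning options: Spark issues one query per partition. The bounds only
      # control how the id range is split into strides; they do not filter rows.
      .option("partitionColumn", "id")
      .option("lowerBound", "1")
      .option("upperBound", "1000000")
      .option("numPartitions", "8")
      .load())

print(df.rdd.getNumPartitions())  # expected: 8
df.show()
```

Note that the partitioning options are used together with `dbtable` rather than `query`, since Spark needs a table reference to wrap its per-partition predicates around.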
diff --git a/docs/integrations/data-ingestion/apache-spark/spark-native-connector.md b/docs/integrations/data-ingestion/apache-spark/spark-native-connector.md index f017436207e..bb34158b8ce 100644 --- a/docs/integrations/data-ingestion/apache-spark/spark-native-connector.md +++ b/docs/integrations/data-ingestion/apache-spark/spark-native-connector.md @@ -63,86 +63,70 @@ Both approaches ensure the ClickHouse connector is available in your Spark envir - ```maven - com.clickhouse.spark - clickhouse-spark-runtime-{{ spark_binary_version }}_{{ scala_binary_version }} - {{ stable_version }} +com.clickhouse.spark +clickhouse-spark-runtime-{{ spark_binary_version }}_{{ scala_binary_version }} +{{ stable_version }} - com.clickhouse - clickhouse-jdbc - all - {{ clickhouse_jdbc_version }} - - - * - * - - +com.clickhouse +clickhouse-jdbc +all +{{ clickhouse_jdbc_version }} + + +* +* + + ``` - Add the following repository if you want to use SNAPSHOT version. - ```maven - - sonatype-oss-snapshots - Sonatype OSS Snapshots Repository - https://s01.oss.sonatype.org/content/repositories/snapshots - + +sonatype-oss-snapshots +Sonatype OSS Snapshots Repository +https://s01.oss.sonatype.org/content/repositories/snapshots + ``` - - ```gradle dependencies { - implementation("com.clickhouse.spark:clickhouse-spark-runtime-{{ spark_binary_version }}_{{ scala_binary_version }}:{{ stable_version }}") - implementation("com.clickhouse:clickhouse-jdbc:{{ clickhouse_jdbc_version }}:all") { transitive = false } +implementation("com.clickhouse.spark:clickhouse-spark-runtime-{{ spark_binary_version }}_{{ scala_binary_version }}:{{ stable_version }}") +implementation("com.clickhouse:clickhouse-jdbc:{{ clickhouse_jdbc_version }}:all") { transitive = false } } ``` - Add the following repository if you want to use the SNAPSHOT version: - ```gradle repositries { - maven { url = "https://s01.oss.sonatype.org/content/repositories/snapshots" } +maven { url = "https://s01.oss.sonatype.org/content/repositories/snapshots" } } ``` - - ```sbt libraryDependencies += "com.clickhouse" % "clickhouse-jdbc" % {{ clickhouse_jdbc_version }} classifier "all" libraryDependencies += "com.clickhouse.spark" %% clickhouse-spark-runtime-{{ spark_binary_version }}_{{ scala_binary_version }} % {{ stable_version }} ``` - - When working with Spark's shell options (Spark SQL CLI, Spark Shell CLI, and Spark Submit command), the dependencies can be registered by passing the required jars: - ```text $SPARK_HOME/bin/spark-sql \ - --jars /path/clickhouse-spark-runtime-{{ spark_binary_version }}_{{ scala_binary_version }}:{{ stable_version }}.jar,/path/clickhouse-jdbc-{{ clickhouse_jdbc_version }}-all.jar +--jars /path/clickhouse-spark-runtime-{{ spark_binary_version }}_{{ scala_binary_version }}:{{ stable_version }}.jar,/path/clickhouse-jdbc-{{ clickhouse_jdbc_version }}-all.jar ``` - If you want to avoid copying the JAR files to your Spark client node, you can use the following instead: - ```text - --repositories https://{maven-central-mirror or private-nexus-repo} \ - --packages com.clickhouse.spark:clickhouse-spark-runtime-{{ spark_binary_version }}_{{ scala_binary_version }}:{{ stable_version }},com.clickhouse:clickhouse-jdbc:{{ clickhouse_jdbc_version }}:all +--repositories https://{maven-central-mirror or private-nexus-repo} \ +--packages com.clickhouse.spark:clickhouse-spark-runtime-{{ spark_binary_version }}_{{ scala_binary_version }}:{{ stable_version }},com.clickhouse:clickhouse-jdbc:{{ clickhouse_jdbc_version }}:all ``` - Note: For SQL-only use 
cases, [Apache Kyuubi](https://github.com/apache/kyuubi) is recommended for production. - @@ -193,34 +177,34 @@ These settings could be set via one of the following: * Pass the configuration to your `spark-submit` command (or to your `spark-shell`/`spark-sql` CLI commands). * Add the configuration when initiating your context. -:::important -When working with a ClickHouse cluster, you need to set a unique catalog name for each instance. -For example: - -```text -spark.sql.catalog.clickhouse1 com.clickhouse.spark.ClickHouseCatalog -spark.sql.catalog.clickhouse1.host 10.0.0.1 -spark.sql.catalog.clickhouse1.protocol https -spark.sql.catalog.clickhouse1.http_port 8443 -spark.sql.catalog.clickhouse1.user default -spark.sql.catalog.clickhouse1.password -spark.sql.catalog.clickhouse1.database default -spark.sql.catalog.clickhouse1.option.ssl true - -spark.sql.catalog.clickhouse2 com.clickhouse.spark.ClickHouseCatalog -spark.sql.catalog.clickhouse2.host 10.0.0.2 -spark.sql.catalog.clickhouse2.protocol https -spark.sql.catalog.clickhouse2.http_port 8443 -spark.sql.catalog.clickhouse2.user default -spark.sql.catalog.clickhouse2.password -spark.sql.catalog.clickhouse2.database default -spark.sql.catalog.clickhouse2.option.ssl true -``` - -That way, you would be able to access clickhouse1 table `.` from Spark SQL by -`clickhouse1..`, and access clickhouse2 table `.` by `clickhouse2..`. - -::: + :::important + When working with a ClickHouse cluster, you need to set a unique catalog name for each instance. + For example: + + ```text + spark.sql.catalog.clickhouse1 com.clickhouse.spark.ClickHouseCatalog + spark.sql.catalog.clickhouse1.host 10.0.0.1 + spark.sql.catalog.clickhouse1.protocol https + spark.sql.catalog.clickhouse1.http_port 8443 + spark.sql.catalog.clickhouse1.user default + spark.sql.catalog.clickhouse1.password + spark.sql.catalog.clickhouse1.database default + spark.sql.catalog.clickhouse1.option.ssl true + + spark.sql.catalog.clickhouse2 com.clickhouse.spark.ClickHouseCatalog + spark.sql.catalog.clickhouse2.host 10.0.0.2 + spark.sql.catalog.clickhouse2.protocol https + spark.sql.catalog.clickhouse2.http_port 8443 + spark.sql.catalog.clickhouse2.user default + spark.sql.catalog.clickhouse2.password + spark.sql.catalog.clickhouse2.database default + spark.sql.catalog.clickhouse2.option.ssl true + ``` + + That way, you would be able to access clickhouse1 table `.` from Spark SQL by + `clickhouse1..`, and access clickhouse2 table `.` by `clickhouse2..`. 
+ + ::: ## ClickHouse Cloud settings {#clickhouse-cloud-settings} @@ -235,75 +219,60 @@ spark.sql.catalog.clickhouse.option.ssl_mode NONE - ```java public static void main(String[] args) { - // Create a Spark session - SparkSession spark = SparkSession.builder() - .appName("example") - .master("local[*]") - .config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog") - .config("spark.sql.catalog.clickhouse.host", "127.0.0.1") - .config("spark.sql.catalog.clickhouse.protocol", "http") - .config("spark.sql.catalog.clickhouse.http_port", "8123") - .config("spark.sql.catalog.clickhouse.user", "default") - .config("spark.sql.catalog.clickhouse.password", "123456") - .config("spark.sql.catalog.clickhouse.database", "default") - .config("spark.clickhouse.write.format", "json") - .getOrCreate(); - - Dataset df = spark.sql("select * from clickhouse.default.example_table"); - - df.show(); - - spark.stop(); - } +// Create a Spark session +SparkSession spark = SparkSession.builder() +.appName("example") +.master("local[*]") +.config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog") +.config("spark.sql.catalog.clickhouse.host", "127.0.0.1") +.config("spark.sql.catalog.clickhouse.protocol", "http") +.config("spark.sql.catalog.clickhouse.http_port", "8123") +.config("spark.sql.catalog.clickhouse.user", "default") +.config("spark.sql.catalog.clickhouse.password", "123456") +.config("spark.sql.catalog.clickhouse.database", "default") +.config("spark.clickhouse.write.format", "json") +.getOrCreate(); +Dataset df = spark.sql("select * from clickhouse.default.example_table"); +df.show(); +spark.stop(); +} ``` - - ```java object NativeSparkRead extends App { - val spark = SparkSession.builder - .appName("example") - .master("local[*]") - .config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog") - .config("spark.sql.catalog.clickhouse.host", "127.0.0.1") - .config("spark.sql.catalog.clickhouse.protocol", "http") - .config("spark.sql.catalog.clickhouse.http_port", "8123") - .config("spark.sql.catalog.clickhouse.user", "default") - .config("spark.sql.catalog.clickhouse.password", "123456") - .config("spark.sql.catalog.clickhouse.database", "default") - .config("spark.clickhouse.write.format", "json") - .getOrCreate - - val df = spark.sql("select * from clickhouse.default.example_table") - - df.show() - - spark.stop() +val spark = SparkSession.builder +.appName("example") +.master("local[*]") +.config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog") +.config("spark.sql.catalog.clickhouse.host", "127.0.0.1") +.config("spark.sql.catalog.clickhouse.protocol", "http") +.config("spark.sql.catalog.clickhouse.http_port", "8123") +.config("spark.sql.catalog.clickhouse.user", "default") +.config("spark.sql.catalog.clickhouse.password", "123456") +.config("spark.sql.catalog.clickhouse.database", "default") +.config("spark.clickhouse.write.format", "json") +.getOrCreate +val df = spark.sql("select * from clickhouse.default.example_table") +df.show() +spark.stop() } ``` - - ```python from pyspark.sql import SparkSession - packages = [ - "com.clickhouse.spark:clickhouse-spark-runtime-3.4_2.12:0.8.0", - "com.clickhouse:clickhouse-client:0.7.0", - "com.clickhouse:clickhouse-http-client:0.7.0", - "org.apache.httpcomponents.client5:httpclient5:5.2.1" - +"com.clickhouse.spark:clickhouse-spark-runtime-3.4_2.12:0.8.0", +"com.clickhouse:clickhouse-client:0.7.0", +"com.clickhouse:clickhouse-http-client:0.7.0", 
+"org.apache.httpcomponents.client5:httpclient5:5.2.1" ] - spark = (SparkSession.builder - .config("spark.jars.packages", ",".join(packages)) - .getOrCreate()) - +.config("spark.jars.packages", ",".join(packages)) +.getOrCreate()) spark.conf.set("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog") spark.conf.set("spark.sql.catalog.clickhouse.host", "127.0.0.1") spark.conf.set("spark.sql.catalog.clickhouse.protocol", "http") @@ -312,29 +281,23 @@ spark.conf.set("spark.sql.catalog.clickhouse.user", "default") spark.conf.set("spark.sql.catalog.clickhouse.password", "123456") spark.conf.set("spark.sql.catalog.clickhouse.database", "default") spark.conf.set("spark.clickhouse.write.format", "json") - df = spark.sql("select * from clickhouse.default.example_table") df.show() - ``` - - ```sql - CREATE TEMPORARY VIEW jdbcTable - USING org.apache.spark.sql.jdbc - OPTIONS ( - url "jdbc:ch://localhost:8123/default", - dbtable "schema.tablename", - user "username", - password "password", - driver "com.clickhouse.jdbc.ClickHouseDriver" - ); - - SELECT * FROM jdbcTable; +CREATE TEMPORARY VIEW jdbcTable +USING org.apache.spark.sql.jdbc +OPTIONS ( +url "jdbc:ch://localhost:8123/default", +dbtable "schema.tablename", +user "username", +password "password", +driver "com.clickhouse.jdbc.ClickHouseDriver" +); +SELECT * FROM jdbcTable; ``` - @@ -342,103 +305,83 @@ df.show() - ```java - public static void main(String[] args) throws AnalysisException { - - // Create a Spark session - SparkSession spark = SparkSession.builder() - .appName("example") - .master("local[*]") - .config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog") - .config("spark.sql.catalog.clickhouse.host", "127.0.0.1") - .config("spark.sql.catalog.clickhouse.protocol", "http") - .config("spark.sql.catalog.clickhouse.http_port", "8123") - .config("spark.sql.catalog.clickhouse.user", "default") - .config("spark.sql.catalog.clickhouse.password", "123456") - .config("spark.sql.catalog.clickhouse.database", "default") - .config("spark.clickhouse.write.format", "json") - .getOrCreate(); - - // Define the schema for the DataFrame - StructType schema = new StructType(new StructField[]{ - DataTypes.createStructField("id", DataTypes.IntegerType, false), - DataTypes.createStructField("name", DataTypes.StringType, false), - }); - - - List data = Arrays.asList( - RowFactory.create(1, "Alice"), - RowFactory.create(2, "Bob") - ); - - // Create a DataFrame - Dataset df = spark.createDataFrame(data, schema); - - df.writeTo("clickhouse.default.example_table").append(); - - spark.stop(); - } +public static void main(String[] args) throws AnalysisException { +// Create a Spark session +SparkSession spark = SparkSession.builder() +.appName("example") +.master("local[*]") +.config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog") +.config("spark.sql.catalog.clickhouse.host", "127.0.0.1") +.config("spark.sql.catalog.clickhouse.protocol", "http") +.config("spark.sql.catalog.clickhouse.http_port", "8123") +.config("spark.sql.catalog.clickhouse.user", "default") +.config("spark.sql.catalog.clickhouse.password", "123456") +.config("spark.sql.catalog.clickhouse.database", "default") +.config("spark.clickhouse.write.format", "json") +.getOrCreate(); +// Define the schema for the DataFrame +StructType schema = new StructType(new StructField[]{ +DataTypes.createStructField("id", DataTypes.IntegerType, false), +DataTypes.createStructField("name", DataTypes.StringType, false), +}); +List data = Arrays.asList( 
+RowFactory.create(1, "Alice"), +RowFactory.create(2, "Bob") +); +// Create a DataFrame +Dataset df = spark.createDataFrame(data, schema); +df.writeTo("clickhouse.default.example_table").append(); +spark.stop(); +} ``` - - ```java object NativeSparkWrite extends App { - // Create a Spark session - val spark: SparkSession = SparkSession.builder - .appName("example") - .master("local[*]") - .config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog") - .config("spark.sql.catalog.clickhouse.host", "127.0.0.1") - .config("spark.sql.catalog.clickhouse.protocol", "http") - .config("spark.sql.catalog.clickhouse.http_port", "8123") - .config("spark.sql.catalog.clickhouse.user", "default") - .config("spark.sql.catalog.clickhouse.password", "123456") - .config("spark.sql.catalog.clickhouse.database", "default") - .config("spark.clickhouse.write.format", "json") - .getOrCreate - - // Define the schema for the DataFrame - val rows = Seq(Row(1, "John"), Row(2, "Doe")) - - val schema = List( - StructField("id", DataTypes.IntegerType, nullable = false), - StructField("name", StringType, nullable = true) - ) - // Create the df - val df: DataFrame = spark.createDataFrame( - spark.sparkContext.parallelize(rows), - StructType(schema) - ) - - df.writeTo("clickhouse.default.example_table").append() - - spark.stop() +// Create a Spark session +val spark: SparkSession = SparkSession.builder +.appName("example") +.master("local[*]") +.config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog") +.config("spark.sql.catalog.clickhouse.host", "127.0.0.1") +.config("spark.sql.catalog.clickhouse.protocol", "http") +.config("spark.sql.catalog.clickhouse.http_port", "8123") +.config("spark.sql.catalog.clickhouse.user", "default") +.config("spark.sql.catalog.clickhouse.password", "123456") +.config("spark.sql.catalog.clickhouse.database", "default") +.config("spark.clickhouse.write.format", "json") +.getOrCreate +// Define the schema for the DataFrame +val rows = Seq(Row(1, "John"), Row(2, "Doe")) +val schema = List( +StructField("id", DataTypes.IntegerType, nullable = false), +StructField("name", StringType, nullable = true) +) +// Create the df +val df: DataFrame = spark.createDataFrame( +spark.sparkContext.parallelize(rows), +StructType(schema) +) +df.writeTo("clickhouse.default.example_table").append() +spark.stop() } ``` - - ```python from pyspark.sql import SparkSession from pyspark.sql import Row - # Feel free to use any other packages combination satesfying the compatability martix provided above. 
packages = [ - "com.clickhouse.spark:clickhouse-spark-runtime-3.4_2.12:0.8.0", - "com.clickhouse:clickhouse-client:0.7.0", - "com.clickhouse:clickhouse-http-client:0.7.0", - "org.apache.httpcomponents.client5:httpclient5:5.2.1" - +"com.clickhouse.spark:clickhouse-spark-runtime-3.4_2.12:0.8.0", +"com.clickhouse:clickhouse-client:0.7.0", +"com.clickhouse:clickhouse-http-client:0.7.0", +"org.apache.httpcomponents.client5:httpclient5:5.2.1" ] - spark = (SparkSession.builder - .config("spark.jars.packages", ",".join(packages)) - .getOrCreate()) - +.config("spark.jars.packages", ",".join(packages)) +.getOrCreate()) spark.conf.set("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog") spark.conf.set("spark.sql.catalog.clickhouse.host", "127.0.0.1") spark.conf.set("spark.sql.catalog.clickhouse.protocol", "http") @@ -447,28 +390,19 @@ spark.conf.set("spark.sql.catalog.clickhouse.user", "default") spark.conf.set("spark.sql.catalog.clickhouse.password", "123456") spark.conf.set("spark.sql.catalog.clickhouse.database", "default") spark.conf.set("spark.clickhouse.write.format", "json") - # Create DataFrame data = [Row(id=11, name="John"), Row(id=12, name="Doe")] df = spark.createDataFrame(data) - # Write DataFrame to ClickHouse df.writeTo("clickhouse.default.example_table").append() - - - ``` - - ```sql - -- resultTalbe is the Spark intermediate df we want to insert into clickhouse.default.example_table - INSERT INTO TABLE clickhouse.default.example_table - SELECT * FROM resultTable; - +-- resultTalbe is the Spark intermediate df we want to insert into clickhouse.default.example_table +INSERT INTO TABLE clickhouse.default.example_table +SELECT * FROM resultTable; ``` - @@ -481,7 +415,7 @@ so you can directly execute commands such as CREATE TABLE, TRUNCATE, and more - ```sql -USE clickhouse; +USE clickhouse; CREATE TABLE test_db.tbl_sql ( create_time TIMESTAMP NOT NULL, diff --git a/docs/integrations/data-ingestion/aws-glue/index.md b/docs/integrations/data-ingestion/aws-glue/index.md index b36ed49d481..057eeb994eb 100644 --- a/docs/integrations/data-ingestion/aws-glue/index.md +++ b/docs/integrations/data-ingestion/aws-glue/index.md @@ -14,12 +14,10 @@ import TabItem from '@theme/TabItem'; [Amazon Glue](https://aws.amazon.com/glue/) is a fully managed, serverless data integration service provided by Amazon Web Services (AWS). It simplifies the process of discovering, preparing, and transforming data for analytics, machine learning, and application development. 
- Although there is no Glue ClickHouse connector available yet, the official JDBC connector can be leveraged to connect and integrate with ClickHouse: - ```java import com.amazonaws.services.glue.util.Job import com.amazonaws.services.glue.util.GlueArgParser @@ -29,42 +27,34 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.DataFrame import scala.collection.JavaConverters._ import com.amazonaws.services.glue.log.GlueLogger - - // Initialize Glue job object GlueJob { - def main(sysArgs: Array[String]) { - val sc: SparkContext = new SparkContext() - val glueContext: GlueContext = new GlueContext(sc) - val spark: SparkSession = glueContext.getSparkSession - val logger = new GlueLogger - import spark.implicits._ - // @params: [JOB_NAME] - val args = GlueArgParser.getResolvedOptions(sysArgs, Seq("JOB_NAME").toArray) - Job.init(args("JOB_NAME"), glueContext, args.asJava) - - // JDBC connection details - val jdbcUrl = "jdbc:ch://{host}:{port}/{schema}" - val jdbcProperties = new java.util.Properties() - jdbcProperties.put("user", "default") - jdbcProperties.put("password", "*******") - jdbcProperties.put("driver", "com.clickhouse.jdbc.ClickHouseDriver") - - // Load the table from ClickHouse - val df: DataFrame = spark.read.jdbc(jdbcUrl, "my_table", jdbcProperties) - - // Show the Spark df, or use it for whatever you like - df.show() - - // Commit the job - Job.commit() - } +def main(sysArgs: Array[String]) { +val sc: SparkContext = new SparkContext() +val glueContext: GlueContext = new GlueContext(sc) +val spark: SparkSession = glueContext.getSparkSession +val logger = new GlueLogger +import spark.implicits._ +// @params: [JOB_NAME] +val args = GlueArgParser.getResolvedOptions(sysArgs, Seq("JOB_NAME").toArray) +Job.init(args("JOB_NAME"), glueContext, args.asJava) +// JDBC connection details +val jdbcUrl = "jdbc:ch://{host}:{port}/{schema}" +val jdbcProperties = new java.util.Properties() +jdbcProperties.put("user", "default") +jdbcProperties.put("password", "*******") +jdbcProperties.put("driver", "com.clickhouse.jdbc.ClickHouseDriver") +// Load the table from ClickHouse +val df: DataFrame = spark.read.jdbc(jdbcUrl, "my_table", jdbcProperties) +// Show the Spark df, or use it for whatever you like +df.show() +// Commit the job +Job.commit() +} } ``` - - ```python import sys from awsglue.transforms import * @@ -72,10 +62,8 @@ from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job - ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ['JOB_NAME']) - sc = SparkContext() glueContext = GlueContext(sc) logger = glueContext.get_logger() @@ -86,25 +74,19 @@ jdbc_url = "jdbc:ch://{host}:{port}/{schema}" query = "select * from my_table" # For cloud usage, please add ssl options df = (spark.read.format("jdbc") - .option("driver", 'com.clickhouse.jdbc.ClickHouseDriver') - .option("url", jdbc_url) - .option("user", 'default') - .option("password", '*******') - .option("query", query) - .load()) - +.option("driver", 'com.clickhouse.jdbc.ClickHouseDriver') +.option("url", jdbc_url) +.option("user", 'default') +.option("password", '*******') +.option("query", query) +.load()) logger.info("num of rows:") logger.info(str(df.count())) logger.info("Data sample:") logger.info(str(df.take(10))) - - job.commit() ``` - For more details, please visit our [Spark & JDBC documentation](/integrations/apache-spark/spark-jdbc#read-data). 
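The Glue examples above cover the read path; writing a DataFrame back to ClickHouse from the same job goes through the JDBC data source as well. The following is a hedged PySpark sketch: `df` is assumed to be a DataFrame produced earlier in the job, `my_table` is an assumed, already-existing target table (JDBC inserts cannot create tables), and the URL placeholder matches the read example.

```python
# Hedged sketch: write a DataFrame produced earlier in the Glue job to ClickHouse.
# The target table must already exist; JDBC inserts cannot create it automatically.
jdbc_url = "jdbc:ch://{host}:{port}/{schema}"  # same placeholder URL as in the read example

(df.write
   .format("jdbc")
   .option("driver", "com.clickhouse.jdbc.ClickHouseDriver")
   .option("url", jdbc_url)
   .option("user", "default")
   .option("password", "*******")
   .option("dbtable", "my_table")
   .mode("append")
   .save())
```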
- - diff --git a/docs/integrations/data-ingestion/azure-data-factory/overview.md b/docs/integrations/data-ingestion/azure-data-factory/overview.md index fb5aa71e492..b2de22d0af0 100644 --- a/docs/integrations/data-ingestion/azure-data-factory/overview.md +++ b/docs/integrations/data-ingestion/azure-data-factory/overview.md @@ -22,4 +22,3 @@ into ClickHouse: |----------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Using the `azureBlobStorage` Table Function](./using_azureblobstorage.md) | Involves using ClickHouse's [`azureBlobStorage` Table Function](https://clickhouse.com/docs/sql-reference/table-functions/azureBlobStorage) to transfer data directly from Azure Blob Storage. | | [Using the ClickHouse HTTP interface](./using_http_interface.md) | Uses the [ClickHouse HTTP interface](https://clickhouse.com/docs/interfaces/http) as a data source within Azure Data Factory, allowing you to copy data or use it in data flow activities as part of your pipelines. | - diff --git a/docs/integrations/data-ingestion/azure-data-factory/using_azureblobstorage.md b/docs/integrations/data-ingestion/azure-data-factory/using_azureblobstorage.md index 910b145215a..c356b268ed7 100644 --- a/docs/integrations/data-ingestion/azure-data-factory/using_azureblobstorage.md +++ b/docs/integrations/data-ingestion/azure-data-factory/using_azureblobstorage.md @@ -36,12 +36,12 @@ To allow ClickHouse to access your Azure Blob Storage, you'll need a connection 1. In the Azure portal, navigate to your **Storage Account**. 2. In the left-hand menu, select **Access keys** under the **Security + - networking** section. - + networking** section. + 3. Choose either **key1** or **key2**, and click the **Show** button next to - the **Connection string** field. - + the **Connection string** field. + 4. Copy the connection string — you'll use this as a parameter in the azureBlobStorage table function. @@ -84,40 +84,40 @@ As an example we will download a single file from the Environmental Sensors Dataset. 1. Download a [sample file](https://clickhouse-public-datasets.s3.eu-central-1.amazonaws.com/sensors/monthly/2019-06_bmp180.csv.zst) - from the [Environmental Sensors Dataset](https://clickhouse.com/docs/getting-started/example-datasets/environmental-sensors) + from the [Environmental Sensors Dataset](https://clickhouse.com/docs/getting-started/example-datasets/environmental-sensors) 2. In the Azure Portal, create a new storage account if you don't already have one. -:::warning -Make sure that **Allow storage account key access** is enabled for your storage -account, otherwise you will not be able to use the account keys to access the -data. -::: + :::warning + Make sure that **Allow storage account key access** is enabled for your storage + account, otherwise you will not be able to use the account keys to access the + data. + ::: 3. Create a new container in your storage account. In this example, we name it sensors. - You can skip this step if you're using an existing container. + You can skip this step if you're using an existing container. 4. Upload the previously downloaded `2019-06_bmp180.csv.zst` file to the - container. + container. 5. Follow the steps described earlier to obtain the Azure Blob Storage - connection string. + connection string. 
-Now that everything is set up, you can query the data directly from Azure Blob Storage: + Now that everything is set up, you can query the data directly from Azure Blob Storage: ```sql SELECT * FROM azureBlobStorage( - '', + '', 'sensors', - '2019-06_bmp180.csv.zst', + '2019-06_bmp180.csv.zst', 'CSVWithNames') LIMIT 10 SETTINGS format_csv_delimiter = ';' ``` 7. To load the data into a table, create a simplified version of the - schema used in the original dataset: + schema used in the original dataset: ```sql CREATE TABLE sensors ( @@ -131,26 +131,26 @@ Now that everything is set up, you can query the data directly from Azure Blob S ORDER BY (timestamp, sensor_id); ``` -:::info -For more information on configuration options and schema inference when -querying external sources like Azure Blob Storage, see [Automatic schema -inference from input data](https://clickhouse.com/docs/interfaces/schema-inference) -::: + :::info + For more information on configuration options and schema inference when + querying external sources like Azure Blob Storage, see [Automatic schema + inference from input data](https://clickhouse.com/docs/interfaces/schema-inference) + ::: 8. Now insert the data from Azure Blob Storage into the sensors table: ```sql INSERT INTO sensors SELECT sensor_id, lat, lon, timestamp, temperature FROM azureBlobStorage( - '', + '', 'sensors', - '2019-06_bmp180.csv.zst', + '2019-06_bmp180.csv.zst', 'CSVWithNames') SETTINGS format_csv_delimiter = ';' ``` -Your sensors table is now populated with data from the `2019-06_bmp180.csv.zst` -file stored in Azure Blob Storage. + Your sensors table is now populated with data from the `2019-06_bmp180.csv.zst` + file stored in Azure Blob Storage. ## Additional resources {#additional-resources} diff --git a/docs/integrations/data-ingestion/azure-data-factory/using_http_interface.md b/docs/integrations/data-ingestion/azure-data-factory/using_http_interface.md index 7d245d5fdab..fcfdafb60dd 100644 --- a/docs/integrations/data-ingestion/azure-data-factory/using_http_interface.md +++ b/docs/integrations/data-ingestion/azure-data-factory/using_http_interface.md @@ -42,25 +42,25 @@ ClickHouse. Using it may however not always be suitable for the following reason - Your data might not be stored in Azure Blob Storage — for example, it could be in Azure SQL Database, Microsoft SQL Server, or Cosmos DB. - Security policies might prevent external access to Blob Storage - altogether — for example, if the storage account is locked down with no public endpoint. - -In such scenarios, you can use Azure Data Factory together with the -[ClickHouse HTTP interface](https://clickhouse.com/docs/interfaces/http) -to send data from Azure services into ClickHouse. - -This method reverses the flow: instead of having ClickHouse pull the data from -Azure, Azure Data Factory pushes the data to ClickHouse. This approach -typically requires your ClickHouse instance to be accessible from the public -internet. - -:::info -It is possible to avoid exposing your ClickHouse instance to the internet by -using Azure Data Factory's Self-hosted Integration Runtime. This setup allows -data to be sent over a private network. However, it's beyond the scope of this -article. You can find more information in the official guide: -[Create and configure a self-hosted integration -runtime](https://learn.microsoft.com/en-us/azure/data-factory/create-self-hosted-integration-runtime?tabs=data-factory) -::: + altogether — for example, if the storage account is locked down with no public endpoint. 
+ + In such scenarios, you can use Azure Data Factory together with the + [ClickHouse HTTP interface](https://clickhouse.com/docs/interfaces/http) + to send data from Azure services into ClickHouse. + + This method reverses the flow: instead of having ClickHouse pull the data from + Azure, Azure Data Factory pushes the data to ClickHouse. This approach + typically requires your ClickHouse instance to be accessible from the public + internet. + + :::info + It is possible to avoid exposing your ClickHouse instance to the internet by + using Azure Data Factory's Self-hosted Integration Runtime. This setup allows + data to be sent over a private network. However, it's beyond the scope of this + article. You can find more information in the official guide: + [Create and configure a self-hosted integration + runtime](https://learn.microsoft.com/en-us/azure/data-factory/create-self-hosted-integration-runtime?tabs=data-factory) + ::: ## Turning ClickHouse into a REST service {#turning-clickhouse-to-a-rest-service} @@ -76,7 +76,7 @@ parsing. ```sql INSERT INTO my_table -SETTINGS +SETTINGS date_time_input_format='best_effort', input_format_json_read_objects_as_strings=1 FORMAT JSONEachRow @@ -126,75 +126,75 @@ an Azure Data Factory already configured, then you can safely skip this step and move to the next one using your existing service. 1. Log in to the [Microsoft Azure Portal](https://portal.azure.com/) and click - **Create a resource**. - + **Create a resource**. + 2. In the Categories pane on the left, select **Analytics**, then click on - **Data Factory** in the list of popular services. - + **Data Factory** in the list of popular services. + 3. Select your subscription and resource group, enter a name for the new Data - Factory instance, choose the region and leave the version as V2. - + Factory instance, choose the region and leave the version as V2. + 3. Click **Review + Create**, then click **Create** to launch the deployment. - + - + -Once the deployment completes successfully, you can start using your new Azure -Data Factory instance. + Once the deployment completes successfully, you can start using your new Azure + Data Factory instance. ## Creating a new REST-Based linked service {#-creating-new-rest-based-linked-service} 1. Log in to the Microsoft Azure Portal and open your Data Factory instance. - + 2. On the Data Factory overview page, click **Launch Studio**. - + 3. In the left-hand menu, select **Manage**, then go to **Linked services**, - and click **+ New** to create a new linked service. - + and click **+ New** to create a new linked service. + 4. In the **New linked service search bar**, type **REST**, select **REST**, and click **Continue** - to create [a REST connector](https://learn.microsoft.com/en-us/azure/data-factory/connector-rest) - instance. - + to create [a REST connector](https://learn.microsoft.com/en-us/azure/data-factory/connector-rest) + instance. + 5. In the linked service configuration pane enter a name for your new service, - click the **Base URL** field, then click **Add dynamic content** (this link only - appears when the field is selected). - + click the **Base URL** field, then click **Add dynamic content** (this link only + appears when the field is selected). + 6. In the dynamic content pane you can create a parameterized URL, which - allows you to define the query later when creating datasets for different - tables — this makes the linked service reusable. 
- + allows you to define the query later when creating datasets for different + tables — this makes the linked service reusable. + 7. Click the **"+"** next to the filter input and add a new parameter, name it - `pQuery`, set the type to String, and set the default value to `SELECT 1`. - Click **Save**. - + `pQuery`, set the type to String, and set the default value to `SELECT 1`. + Click **Save**. + 8. In the expression field, enter the following and click **OK**. Replace - `your-clickhouse-url.com` with the actual address of your ClickHouse - instance. - ```text - @{concat('https://your-clickhouse-url.com:8443/?query=', encodeUriComponent(linkedService().pQuery))} - ``` - + `your-clickhouse-url.com` with the actual address of your ClickHouse + instance. + ```text + @{concat('https://your-clickhouse-url.com:8443/?query=', encodeUriComponent(linkedService().pQuery))} + ``` + 9. Back in the main form select Basic authentication, enter the username and - password used to connect to your ClickHouse HTTP interface, click **Test - connection**. If everything is configured correctly, you'll see a success - message. - + password used to connect to your ClickHouse HTTP interface, click **Test + connection**. If everything is configured correctly, you'll see a success + message. + 10. Click **Create** to finalize the setup. -You should now see your newly registered REST-based linked service in the list. + You should now see your newly registered REST-based linked service in the list. ## Creating a new dataset for the ClickHouse HTTP Interface {#creating-a-new-dataset-for-the-clickhouse-http-interface} @@ -206,60 +206,60 @@ In this example, we'll insert a small portion of the [Environmental Sensors Data](https://clickhouse.com/docs/getting-started/example-datasets/environmental-sensors). 1. Open the ClickHouse query console of your choice — this could be the - ClickHouse Cloud web UI, the CLI client, or any other interface you use to - run queries — and create the target table: - ```sql - CREATE TABLE sensors - ( + ClickHouse Cloud web UI, the CLI client, or any other interface you use to + run queries — and create the target table: + ```sql + CREATE TABLE sensors + ( sensor_id UInt16, lat Float32, lon Float32, timestamp DateTime, temperature Float32 - ) - ENGINE = MergeTree - ORDER BY (timestamp, sensor_id); - ``` + ) + ENGINE = MergeTree + ORDER BY (timestamp, sensor_id); + ``` 2. In Azure Data Factory Studio, select Author in the left-hand pane. Hover - over the Dataset item, click the three-dot icon, and choose New dataset. - + over the Dataset item, click the three-dot icon, and choose New dataset. + 3. In the search bar, type **REST**, select **REST**, and click **Continue**. - Enter a name for your dataset and select the **linked service** you created - in the previous step. Click **OK** to create the dataset. - + Enter a name for your dataset and select the **linked service** you created + in the previous step. Click **OK** to create the dataset. + 4. You should now see your newly created dataset listed under the Datasets - section in the Factory Resources pane on the left. Select the dataset to - open its properties. You'll see the `pQuery` parameter that was defined in the - linked service. Click the **Value** text field. Then click **Add dynamic** - content. - + section in the Factory Resources pane on the left. Select the dataset to + open its properties. You'll see the `pQuery` parameter that was defined in the + linked service. Click the **Value** text field. 
Then click **Add dynamic** + content. + 5. In the pane that opens, paste the following query: - ```sql - INSERT INTO sensors - SETTINGS - date_time_input_format=''best_effort'', - input_format_json_read_objects_as_strings=1 - FORMAT JSONEachRow - ``` - - :::danger - All single quotes `'` in the query must be replaced with two single quotes - `''`. This is required by Azure Data Factory's expression parser. If you - don't escape them, you may not see an error immediately — but it will fail - later when you try to use or save the dataset. For example, `'best_effort'` - must be written as `''best_effort''`. - ::: - - + ```sql + INSERT INTO sensors + SETTINGS + date_time_input_format=''best_effort'', + input_format_json_read_objects_as_strings=1 + FORMAT JSONEachRow + ``` + + :::danger + All single quotes `'` in the query must be replaced with two single quotes + `''`. This is required by Azure Data Factory's expression parser. If you + don't escape them, you may not see an error immediately — but it will fail + later when you try to use or save the dataset. For example, `'best_effort'` + must be written as `''best_effort''`. + ::: + + 6. Click OK to save the expression. Click Test connection. If everything is - configured correctly, you'll see a Connection successful message. Click Publish - all at the top of the page to save your changes. - + configured correctly, you'll see a Connection successful message. Click Publish + all at the top of the page to save your changes. + ### Setting up an example dataset {#setting-up-an-example-dataset} @@ -286,38 +286,37 @@ Now that we've configured both the input and output datasets, we can set up a `sensors` table in ClickHouse. 1. Open **Azure Data Factory Studio**, go to the **Author tab**. In the - **Factory Resources** pane, hover over **Pipeline**, click the three-dot - icon, and select **New pipeline**. - + **Factory Resources** pane, hover over **Pipeline**, click the three-dot + icon, and select **New pipeline**. + 2. In the **Activities** pane, expand the **Move and transform** section and - drag the **Copy data** activity onto the canvas. - + drag the **Copy data** activity onto the canvas. + 3. Select the **Source** tab, and choose the source dataset you created earlier. - + 4. Go to the **Sink** tab and select the ClickHouse dataset created for your - sensors table. Set **Request method** to POST. Ensure **HTTP compression - type** is set to **None**. - :::warning - HTTP compression does not work correctly in Azure Data Factory's Copy Data - activity. When enabled, Azure sends a payload consisting of zero bytes only - — likely a bug in the service. Be sure to leave compression disabled. - ::: - :::info - We recommend keeping the default batch size of 10,000, or even increasing it - further. For more details, see - [Selecting an Insert Strategy / Batch inserts if synchronous](https://clickhouse.com/docs/best-practices/selecting-an-insert-strategy#batch-inserts-if-synchronous) - for more details. - ::: - - + sensors table. Set **Request method** to POST. Ensure **HTTP compression + type** is set to **None**. + :::warning + HTTP compression does not work correctly in Azure Data Factory's Copy Data + activity. When enabled, Azure sends a payload consisting of zero bytes only + — likely a bug in the service. Be sure to leave compression disabled. + ::: + :::info + We recommend keeping the default batch size of 10,000, or even increasing it + further. 
For more details, see + [Selecting an Insert Strategy / Batch inserts if synchronous](https://clickhouse.com/docs/best-practices/selecting-an-insert-strategy#batch-inserts-if-synchronous) + for more details. + ::: + 5. Click **Debug** at the top of the canvas to run the pipeline. After a short - wait, the activity will be queued and executed. If everything is configured - correctly, the task should finish with a **Success** status. - + wait, the activity will be queued and executed. If everything is configured + correctly, the task should finish with a **Success** status. + 6. Once complete, click **Publish all** to save your pipeline and dataset changes. diff --git a/docs/integrations/data-ingestion/azure-synapse/index.md b/docs/integrations/data-ingestion/azure-synapse/index.md index 9da775c7d7b..b3205151ba4 100644 --- a/docs/integrations/data-ingestion/azure-synapse/index.md +++ b/docs/integrations/data-ingestion/azure-synapse/index.md @@ -18,7 +18,6 @@ Within Synapse, Spark pools provide on-demand, scalable [Apache Spark](https://s This article will show you how to integrate the [ClickHouse Spark connector](/integrations/apache-spark/spark-native-connector) when working with Apache Spark within Azure Synapse. - ## Add the connector's dependencies {#add-connector-dependencies} @@ -27,13 +26,11 @@ Azure Synapse supports three levels of [packages maintenance](https://learn.micr 2. Spark pool level 3. Session level -
- -Follow the [Manage libraries for Apache Spark pools guide](https://learn.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-manage-pool-packages) and add the following required dependencies to your Spark application - - `clickhouse-spark-runtime-{spark_version}_{scala_version}-{connector_version}.jar` - [official maven](https://mvnrepository.com/artifact/com.clickhouse.spark) - - `clickhouse-jdbc-{java_client_version}-all.jar` - [official maven](https://mvnrepository.com/artifact/com.clickhouse/clickhouse-jdbc) + Follow the [Manage libraries for Apache Spark pools guide](https://learn.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-manage-pool-packages) and add the following required dependencies to your Spark application + - `clickhouse-spark-runtime-{spark_version}_{scala_version}-{connector_version}.jar` - [official maven](https://mvnrepository.com/artifact/com.clickhouse.spark) + - `clickhouse-jdbc-{java_client_version}-all.jar` - [official maven](https://mvnrepository.com/artifact/com.clickhouse/clickhouse-jdbc) -Please visit our [Spark Connector Compatibility Matrix](/integrations/apache-spark/spark-native-connector#compatibility-matrix) docs to understand which versions suit your needs. + Please visit our [Spark Connector Compatibility Matrix](/integrations/apache-spark/spark-native-connector#compatibility-matrix) docs to understand which versions suit your needs. ## Add ClickHouse as a catalog {#add-clickhouse-as-catalog} @@ -42,14 +39,14 @@ There are a variety of ways to add Spark configs to your session: * Add configurations via Azure Synapse UI * Add configurations in your Synapse notebook -Follow this [Manage Apache Spark configuration](https://learn.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-azure-create-spark-configuration) -and add the [connector required Spark configurations](/integrations/apache-spark/spark-native-connector#register-the-catalog-required). + Follow this [Manage Apache Spark configuration](https://learn.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-azure-create-spark-configuration) + and add the [connector required Spark configurations](/integrations/apache-spark/spark-native-connector#register-the-catalog-required). -For instance, you can configure your Spark session in your notebook with these settings: + For instance, you can configure your Spark session in your notebook with these settings: -```python -%%configure -f -{ + ```python + %%configure -f + { "conf": { "spark.sql.catalog.clickhouse": "com.clickhouse.spark.ClickHouseCatalog", "spark.sql.catalog.clickhouse.host": "", @@ -59,18 +56,18 @@ For instance, you can configure your Spark session in your notebook with these s "spark.sql.catalog.clickhouse.password": "password", "spark.sql.catalog.clickhouse.database": "default" } -} -``` + } + ``` -Make sure it will be in the first cell as follows: + Make sure it will be in the first cell as follows: - + -Please visit the [ClickHouse Spark configurations page](/integrations/apache-spark/spark-native-connector#configurations) for additional settings. + Please visit the [ClickHouse Spark configurations page](/integrations/apache-spark/spark-native-connector#configurations) for additional settings. -:::info -When working with ClickHouse Cloud Please make sure to set the [required Spark settings](/integrations/apache-spark/spark-native-connector#clickhouse-cloud-settings). 
-::: + :::info + When working with ClickHouse Cloud Please make sure to set the [required Spark settings](/integrations/apache-spark/spark-native-connector#clickhouse-cloud-settings). + ::: ## Setup verification {#setup-verification} @@ -79,7 +76,6 @@ There, look for your ClickHouse related settings: - ## Additional resources {#additional-resources} - [ClickHouse Spark Connector Docs](/integrations/apache-spark) diff --git a/docs/integrations/data-ingestion/clickpipes/aws-privatelink.md b/docs/integrations/data-ingestion/clickpipes/aws-privatelink.md index 1b93ff82aa7..b2816a036dc 100644 --- a/docs/integrations/data-ingestion/clickpipes/aws-privatelink.md +++ b/docs/integrations/data-ingestion/clickpipes/aws-privatelink.md @@ -125,7 +125,7 @@ You are ready to [create a ClickPipe with Reverse private endpoint](#creating-cl - Set `Resource configuration ID` to the ID of the Resource-Configuration created in step 2. - Set `Resource share ARN` to the ARN of the Resource-Share created in step 3. -For more details on PrivateLink with VPC resource, see [AWS documentation](https://docs.aws.amazon.com/vpc/latest/privatelink/privatelink-access-resources.html). + For more details on PrivateLink with VPC resource, see [AWS documentation](https://docs.aws.amazon.com/vpc/latest/privatelink/privatelink-access-resources.html). ### MSK multi-VPC connectivity {#msk-multi-vpc} @@ -158,50 +158,50 @@ It's a preferred choice for: - [Cross-region connectivity for Postgres CDC](/knowledgebase/aws-privatelink-setup-for-clickpipes) - Cross-region connectivity for MSK cluster. Please reach out to the ClickHouse support team for assistance. -See the [getting started](https://docs.aws.amazon.com/vpc/latest/privatelink/privatelink-share-your-services.html) guide for more details. + See the [getting started](https://docs.aws.amazon.com/vpc/latest/privatelink/privatelink-share-your-services.html) guide for more details. -:::info -Add ClickPipes account ID `072088201116` to the allowed principals to your VPC endpoint service. -See AWS guide for [managing permissions](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#add-remove-permissions) for more details. -::: + :::info + Add ClickPipes account ID `072088201116` to the allowed principals to your VPC endpoint service. + See AWS guide for [managing permissions](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#add-remove-permissions) for more details. + ::: -:::info -[Cross-region access](https://docs.aws.amazon.com/vpc/latest/privatelink/privatelink-share-your-services.html#endpoint-service-cross-region) -can be configured for ClickPipes. Add [your ClickPipe region](#aws-privatelink-regions) to the allowed regions in your VPC endpoint service. -::: + :::info + [Cross-region access](https://docs.aws.amazon.com/vpc/latest/privatelink/privatelink-share-your-services.html#endpoint-service-cross-region) + can be configured for ClickPipes. Add [your ClickPipe region](#aws-privatelink-regions) to the allowed regions in your VPC endpoint service. + ::: ## Creating a ClickPipe with reverse private endpoint {#creating-clickpipe} 1. Access the SQL Console for your ClickHouse Cloud Service. - + 2. Select the `Data Sources` button on the left-side menu and click on "Set up a ClickPipe" - + 3. Select either Kafka or Postgres as a data source. - + 4. Select the `Reverse private endpoint` option. - + 5. Select any of existing reverse private endpoints or create a new one. 
-:::info -If cross-region access is required for RDS, you need to create a VPC endpoint service and -[this guide should provide](/knowledgebase/aws-privatelink-setup-for-clickpipes) a good starting point to set it up. + :::info + If cross-region access is required for RDS, you need to create a VPC endpoint service and + [this guide should provide](/knowledgebase/aws-privatelink-setup-for-clickpipes) a good starting point to set it up. -For same-region access, creating a VPC Resource is the recommended approach. -::: + For same-region access, creating a VPC Resource is the recommended approach. + ::: - + 6. Provide the required parameters for the selected endpoint type. - + - For VPC resource, provide the configuration share ARN and configuration ID. - For MSK multi-VPC, provide the cluster ARN and authentication method used with a created endpoint. @@ -209,22 +209,22 @@ For same-region access, creating a VPC Resource is the recommended approach. 7. Click on `Create` and wait for the reverse private endpoint to be ready. - If you are creating a new endpoint, it will take some time to set up the endpoint. - The page will refresh automatically once the endpoint is ready. - VPC endpoint service might require accepting the connection request in your AWS console. + If you are creating a new endpoint, it will take some time to set up the endpoint. + The page will refresh automatically once the endpoint is ready. + VPC endpoint service might require accepting the connection request in your AWS console. - + 8. Once the endpoint is ready, you can use a DNS name to connect to the data source. - On a list of endpoints, you can see the DNS name for the available endpoint. - It can be either an internally ClickPipes provisioned DNS name or a private DNS name supplied by a PrivateLink service. - DNS name is not a complete network address. - Add the port according to the data source. + On a list of endpoints, you can see the DNS name for the available endpoint. + It can be either an internally ClickPipes provisioned DNS name or a private DNS name supplied by a PrivateLink service. + DNS name is not a complete network address. + Add the port according to the data source. - MSK connection string can be accessed in the AWS console. + MSK connection string can be accessed in the AWS console. - To see a full list of DNS names, access it in the cloud service settings. + To see a full list of DNS names, access it in the cloud service settings. ## Managing existing reverse private endpoints {#managing-existing-endpoints} @@ -232,11 +232,11 @@ You can manage existing reverse private endpoints in the ClickHouse Cloud servic 1. On a sidebar find the `Settings` button and click on it. - + 2. Click on `Reverse private endpoints` in a `ClickPipe reverse private endpoints` section. - + Reverse private endpoint extended information is shown in the flyout. diff --git a/docs/integrations/data-ingestion/clickpipes/index.md b/docs/integrations/data-ingestion/clickpipes/index.md index 1f1c9700eb4..de87811bf85 100644 --- a/docs/integrations/data-ingestion/clickpipes/index.md +++ b/docs/integrations/data-ingestion/clickpipes/index.md @@ -48,10 +48,8 @@ import Image from '@theme/IdealImage'; | [Postgres](/integrations/clickpipes/postgres) | |DBMS| Stable | Configure ClickPipes and start ingesting data from Postgres into ClickHouse Cloud. | | [MySQL](/integrations/clickpipes/mysql) | |DBMS| Private Beta | Configure ClickPipes and start ingesting data from MySQL into ClickHouse Cloud. 
| - More connectors will get added to ClickPipes, you can find out more by [contacting us](https://clickhouse.com/company/contact?loc=clickpipes). - ## List of Static IPs {#list-of-static-ips} The following are the static NAT IPs (separated by region) that ClickPipes uses to connect to your external services. Add your related instance region IPs to your IP allow list to allow traffic. @@ -64,14 +62,14 @@ For all services, ClickPipes traffic will originate from a default region based - **us-west-2**: For services in AWS `us-west-2` created on or after 24 Jun 2025 (services created before this date use `us-east-2` IPs). - **us-east-2**: For all other regions not explicitly listed. (this includes GCP and Azure US regions) -| AWS region | IP Addresses | -|---------------------------------------| ------------------------------------------------------------------------------------------------------------------------------------------------ | -| **eu-central-1** | `18.195.233.217`, `3.127.86.90`, `35.157.23.2`, `18.197.167.47`, `3.122.25.29`, `52.28.148.40` | -| **us-east-1** | `54.82.38.199`, `3.90.133.29`, `52.5.177.8`, `3.227.227.145`, `3.216.6.184`, `54.84.202.92`, `3.131.130.196`, `3.23.172.68`, `3.20.208.150` | -| **us-east-2** | `3.131.130.196`, `3.23.172.68`, `3.20.208.150`, `3.132.20.192`, `18.119.76.110`, `3.134.185.180` | -| **ap-south-1** (from 25 Jun 2025) | `13.203.140.189`, `13.232.213.12`, `13.235.145.208`, `35.154.167.40`, `65.0.39.245`, `65.1.225.89` | -| **ap-southeast-2** (from 25 Jun 2025) | `3.106.48.103`, `52.62.168.142`, `13.55.113.162`, `3.24.61.148`, `54.206.77.184`, `54.79.253.17` | -| **us-west-2** (from 24 Jun 2025) | `52.42.100.5`, `44.242.47.162`, `52.40.44.52`, `44.227.206.163`, `44.246.241.23`, `35.83.230.19` | + | AWS region | IP Addresses | + |---------------------------------------| ------------------------------------------------------------------------------------------------------------------------------------------------ | + | **eu-central-1** | `18.195.233.217`, `3.127.86.90`, `35.157.23.2`, `18.197.167.47`, `3.122.25.29`, `52.28.148.40` | + | **us-east-1** | `54.82.38.199`, `3.90.133.29`, `52.5.177.8`, `3.227.227.145`, `3.216.6.184`, `54.84.202.92`, `3.131.130.196`, `3.23.172.68`, `3.20.208.150` | + | **us-east-2** | `3.131.130.196`, `3.23.172.68`, `3.20.208.150`, `3.132.20.192`, `18.119.76.110`, `3.134.185.180` | + | **ap-south-1** (from 25 Jun 2025) | `13.203.140.189`, `13.232.213.12`, `13.235.145.208`, `35.154.167.40`, `65.0.39.245`, `65.1.225.89` | + | **ap-southeast-2** (from 25 Jun 2025) | `3.106.48.103`, `52.62.168.142`, `13.55.113.162`, `3.24.61.148`, `54.206.77.184`, `54.79.253.17` | + | **us-west-2** (from 24 Jun 2025) | `52.42.100.5`, `44.242.47.162`, `52.40.44.52`, `44.227.206.163`, `44.246.241.23`, `35.83.230.19` | ## Adjusting ClickHouse settings {#adjusting-clickhouse-settings} ClickHouse Cloud provides sensible defaults for most of the use cases. However, if you need to adjust some ClickHouse settings for the ClickPipes destination tables, a dedicated role for ClickPipes is the most flexible solution. @@ -79,7 +77,7 @@ Steps: 1. create a custom role `CREATE ROLE my_clickpipes_role SETTINGS ...`. See [CREATE ROLE](/sql-reference/statements/create/role.md) syntax for details. 2. add the custom role to ClickPipes user on step `Details and Settings` during the ClickPipes creation. 
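As a minimal sketch of the two steps above (the setting and the destination database/table are chosen purely for illustration), this could look like:

```sql
-- Illustrative only: the setting and the destination database/table are placeholders.
CREATE ROLE my_clickpipes_role SETTINGS max_insert_block_size = 1048576;
GRANT INSERT ON my_database.my_destination_table TO my_clickpipes_role;
```

You would then assign `my_clickpipes_role` to the ClickPipes user in the `Details and Settings` step when creating the ClickPipe.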
- + ## Error reporting {#error-reporting} ClickPipes will store errors in two separate tables depending on the type of error encountered during the ingestion process. @@ -93,16 +91,16 @@ If ClickPipes cannot connect to a data source after 15 min or to a destination a ## FAQ {#faq} - **What is ClickPipes?** - ClickPipes is a ClickHouse Cloud feature that makes it easy for users to connect their ClickHouse services to external data sources, specifically Kafka. With ClickPipes for Kafka, users can easily continuously load data into ClickHouse, making it available for real-time analytics. + ClickPipes is a ClickHouse Cloud feature that makes it easy for users to connect their ClickHouse services to external data sources, specifically Kafka. With ClickPipes for Kafka, users can easily continuously load data into ClickHouse, making it available for real-time analytics. - **Does ClickPipes support data transformation?** - Yes, ClickPipes supports basic data transformation by exposing the DDL creation. You can then apply more advanced transformations to the data as it is loaded into its destination table in a ClickHouse Cloud service leveraging ClickHouse's [materialized views feature](/guides/developer/cascading-materialized-views). + Yes, ClickPipes supports basic data transformation by exposing the DDL creation. You can then apply more advanced transformations to the data as it is loaded into its destination table in a ClickHouse Cloud service leveraging ClickHouse's [materialized views feature](/guides/developer/cascading-materialized-views). - **Does using ClickPipes incur an additional cost?** - ClickPipes is billed on two dimensions: Ingested Data and Compute. The full details of the pricing are available on [this page](/cloud/manage/jan-2025-faq/pricing-dimensions#clickpipes-pricing-faq). Running ClickPipes might also generate an indirect compute and storage cost on the destination ClickHouse Cloud service similar to any ingest workload. + ClickPipes is billed on two dimensions: Ingested Data and Compute. The full details of the pricing are available on [this page](/cloud/manage/jan-2025-faq/pricing-dimensions#clickpipes-pricing-faq). Running ClickPipes might also generate an indirect compute and storage cost on the destination ClickHouse Cloud service similar to any ingest workload. - **Is there a way to handle errors or failures when using ClickPipes for Kafka?** - Yes, ClickPipes for Kafka will automatically retry in the event of failures when consuming data from Kafka for any operational issue including network issues, connectivity issues, etc. In the event of malformed data or invalid schema, ClickPipes will store the record in the record_error table and continue processing. + Yes, ClickPipes for Kafka will automatically retry in the event of failures when consuming data from Kafka for any operational issue including network issues, connectivity issues, etc. In the event of malformed data or invalid schema, ClickPipes will store the record in the record_error table and continue processing. 
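As a hedged sketch of the materialized-view approach mentioned above (all table, view, and column names here are hypothetical), a per-minute rollup over a ClickPipes destination table could look like:

```sql
-- Target table that stores the rolled-up data
CREATE TABLE events_per_minute
(
    minute DateTime,
    event_count UInt64
)
ENGINE = SummingMergeTree
ORDER BY minute;

-- Materialized view that transforms rows as they are inserted into the destination table
CREATE MATERIALIZED VIEW events_per_minute_mv TO events_per_minute AS
SELECT
    toStartOfMinute(event_time) AS minute,
    count() AS event_count
FROM clickpipes_destination_table
GROUP BY minute;
```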
diff --git a/docs/integrations/data-ingestion/clickpipes/kafka/01_create-kafka-clickpipe.md b/docs/integrations/data-ingestion/clickpipes/kafka/01_create-kafka-clickpipe.md index 7f4b45eb00f..9c68def35cf 100644 --- a/docs/integrations/data-ingestion/clickpipes/kafka/01_create-kafka-clickpipe.md +++ b/docs/integrations/data-ingestion/clickpipes/kafka/01_create-kafka-clickpipe.md @@ -21,51 +21,34 @@ import Image from '@theme/IdealImage'; > In this guide, we will walk you through the process of creating your first Kafka ClickPipe. - ## Navigate to data sources {#1-load-sql-console} Select the `Data Sources` button on the left-side menu and click on "Set up a ClickPipe". - ## Select a data source {#2-select-data-source} Select your Kafka data source from the list. - ## Configure the data source {#3-configure-data-source} Fill out the form by providing your ClickPipe with a name, a description (optional), your credentials, and other connection details. - ## Configure a schema registry (optional) {#4-configure-your-schema-registry} A valid schema is required for Avro streams. See [Schema registries](./02_schema-registries.md) for more details on how to configure a schema registry. - ## Configure a reverse private endpoint (optional) {#5-configure-reverse-private-endpoint} Configure a Reverse Private Endpoint to allow ClickPipes to connect to your Kafka cluster using AWS PrivateLink. See our [AWS PrivateLink documentation](../aws-privatelink.md) for more information. - ## Select your topic {#6-select-your-topic} Select your topic and the UI will display a sample document from the topic. - ## Configure your destination table {#7-configure-your-destination-table} - In the next step, you can select whether you want to ingest data into a new ClickHouse table or reuse an existing one. Follow the instructions in the screen to modify your table name, schema, and settings. You can see a real-time preview of your changes in the sample table at the top. - - You can also customize the advanced settings using the controls provided - - - ## Configure permissions {#8-configure-permissions} ClickPipes will create a dedicated user for writing data into a destination table. You can select a role for this internal user using a custom role or one of the predefined role: - `Full access`: with the full access to the cluster. It might be useful if you use Materialized View or Dictionary with the destination table. - `Only destination table`: with the `INSERT` permissions to the destination table only. - - ## Complete setup {#9-complete-setup} Clicking on "Create ClickPipe" will create and run your ClickPipe. It will now be listed in the Data Sources section. - - diff --git a/docs/integrations/data-ingestion/clickpipes/kafka/02_schema-registries.md b/docs/integrations/data-ingestion/clickpipes/kafka/02_schema-registries.md index 6db3464799b..ab1ffee6722 100644 --- a/docs/integrations/data-ingestion/clickpipes/kafka/02_schema-registries.md +++ b/docs/integrations/data-ingestion/clickpipes/kafka/02_schema-registries.md @@ -18,7 +18,7 @@ Schema registries that use the Confluent Schema Registry API are supported. This - AWS MSK - Upstash -ClickPipes is not currently compatible with the AWS Glue Schema registry or the Azure Schema Registry. + ClickPipes is not currently compatible with the AWS Glue Schema registry or the Azure Schema Registry. 
## Configuration {#schema-registry-configuration} @@ -44,5 +44,3 @@ The following rules are applied to the mapping between the retrieved Avro schema - If the Avro schema contains a field that is not included in the ClickHouse destination mapping, that field is ignored. - If the Avro schema is missing a field defined in the ClickHouse destination mapping, the ClickHouse column will be populated with a "zero" value, such as 0 or an empty string. Note that DEFAULT expressions are not currently evaluated for ClickPipes inserts (this is temporary limitation pending updates to the ClickHouse server default processing). - If the Avro schema field and the ClickHouse column are incompatible, inserts of that row/message will fail, and the failure will be recorded in the ClickPipes errors table. Note that several implicit conversions are supported (like between numeric types), but not all (for example, an Avro record field can not be inserted into an Int32 ClickHouse column). - - diff --git a/docs/integrations/data-ingestion/clickpipes/kafka/03_reference.md b/docs/integrations/data-ingestion/clickpipes/kafka/03_reference.md index 79d928e4cdf..c559536e69b 100644 --- a/docs/integrations/data-ingestion/clickpipes/kafka/03_reference.md +++ b/docs/integrations/data-ingestion/clickpipes/kafka/03_reference.md @@ -28,7 +28,6 @@ import ExperimentalBadge from '@site/src/theme/badges/ExperimentalBadge'; | Azure Event Hubs ||Streaming| Stable | Configure ClickPipes and start ingesting streaming data from Azure Event Hubs into ClickHouse Cloud. | | WarpStream ||Streaming| Stable | Configure ClickPipes and start ingesting streaming data from WarpStream into ClickHouse Cloud. | - ## Supported data formats {#supported-data-formats} The supported formats are: @@ -82,10 +81,10 @@ have to submit a support ticket to enable it on your service. ClickPipes supports the Variant type in the following circumstances: - Avro Unions. If your Avro schema contains a union with multiple non-null types, ClickPipes will infer the - appropriate variant type. Variant types are not otherwise supported for Avro data. + appropriate variant type. Variant types are not otherwise supported for Avro data. - JSON fields. You can manually specify a Variant type (such as `Variant(String, Int64, DateTime)`) for any JSON field - in the source data stream. Because of the way ClickPipes determines the correct variant subtype to use, only one integer or datetime - type can be used in the Variant definition - for example, `Variant(Int64, UInt32)` is not supported. + in the source data stream. Because of the way ClickPipes determines the correct variant subtype to use, only one integer or datetime + type can be used in the Variant definition - for example, `Variant(Int64, UInt32)` is not supported. #### JSON type support {#json-type-support} @@ -99,7 +98,7 @@ ClickPipes support the JSON type in the following circumstances: - Avro String and Bytes types can be assigned to a JSON column if the column actually holds JSON String objects. - JSON fields that are always a JSON object can be assigned to a JSON destination column. -Note that you will have to manually change the destination column to the desired JSON type, including any fixed or skipped paths. + Note that you will have to manually change the destination column to the desired JSON type, including any fixed or skipped paths. 
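As an illustrative sketch only (the table, column, and paths below are hypothetical, and the JSON type requires a ClickHouse version that supports it, as noted above), a destination table with a JSON column that pins one path's type and skips another might be declared as:

```sql
CREATE TABLE kafka_events
(
    id UInt64,
    -- 'user_id' inside the JSON document is pinned to UInt64; 'debug' sub-paths are dropped on insert
    payload JSON(user_id UInt64, SKIP debug)
)
ENGINE = MergeTree
ORDER BY id;
```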
## Kafka virtual columns {#kafka-virtual-columns} @@ -116,6 +115,6 @@ The following virtual columns are supported for Kafka compatible streaming data | `_header_values` | Parallel array of headers in the record Headers | `Array(String)` | | `_raw_message` | Full Kafka Message | `String` | -Note that the `_raw_message` column is only recommended for JSON data. +Note that the `_raw_message` column is only recommended for JSON data. For use cases where only the JSON string is required (such as using ClickHouse [`JsonExtract*`](/sql-reference/functions/json-functions#jsonextract-functions) functions to populate a downstream materialized view), it may improve ClickPipes performance to delete all the "non-virtual" columns. diff --git a/docs/integrations/data-ingestion/clickpipes/kafka/04_best_practices.md b/docs/integrations/data-ingestion/clickpipes/kafka/04_best_practices.md index 93a00bf41f4..441b1845293 100644 --- a/docs/integrations/data-ingestion/clickpipes/kafka/04_best_practices.md +++ b/docs/integrations/data-ingestion/clickpipes/kafka/04_best_practices.md @@ -34,11 +34,11 @@ ClickPipes supports the following AWS MSK authentication - [SASL/SCRAM-SHA-512](https://docs.aws.amazon.com/msk/latest/developerguide/msk-password.html) authentication - [IAM Credentials or Role-based access](https://docs.aws.amazon.com/msk/latest/developerguide/how-to-use-iam-access-control.html) authentication -When using IAM authentication to connect to an MSK broker, the IAM role must have the necessary permissions. -Below is an example of the required IAM policy for Apache Kafka APIs for MSK: + When using IAM authentication to connect to an MSK broker, the IAM role must have the necessary permissions. + Below is an example of the required IAM policy for Apache Kafka APIs for MSK: -```json -{ + ```json + { "Version": "2012-10-17", "Statement": [ { @@ -71,8 +71,8 @@ Below is an example of the required IAM policy for Apache Kafka APIs for MSK: ] } ] -} -``` + } + ``` #### Configuring a trusted relationship {#configuring-a-trusted-relationship} diff --git a/docs/integrations/data-ingestion/clickpipes/kafka/05_faq.md b/docs/integrations/data-ingestion/clickpipes/kafka/05_faq.md index ae3a113373e..baa1f36326e 100644 --- a/docs/integrations/data-ingestion/clickpipes/kafka/05_faq.md +++ b/docs/integrations/data-ingestion/clickpipes/kafka/05_faq.md @@ -6,126 +6,92 @@ sidebar_position: 1 title: 'Kafka ClickPipes FAQ' --- - ## Kafka ClickPipes FAQ {#faq} ### General {#general}
- How does ClickPipes for Kafka work? - ClickPipes uses a dedicated architecture running the Kafka Consumer API to read data from a specified topic and then inserts the data into a ClickHouse table on a specific ClickHouse Cloud service.
- What's the difference between ClickPipes and the ClickHouse Kafka Table Engine?

- The Kafka Table engine is a ClickHouse core capability that implements a "pull model", where the ClickHouse server itself connects to Kafka, pulls events, and then writes them locally.

- ClickPipes is a separate cloud service that runs independently of the ClickHouse service; it connects to Kafka (or other data sources) and pushes events to an associated ClickHouse Cloud service. This decoupled architecture allows for superior operational flexibility, clear separation of concerns, scalable ingestion, graceful failure management, extensibility, and more.
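To make the contrast concrete, a minimal Kafka table engine definition (the pull model described above) might look like the following sketch; the broker, topic, and consumer group are placeholders, and ClickPipes itself needs no such DDL:

```sql
-- Pull model: the ClickHouse server itself consumes from Kafka via this table
CREATE TABLE kafka_queue
(
    message String
)
ENGINE = Kafka
SETTINGS
    kafka_broker_list = 'kafka-1.example.com:9092',
    kafka_topic_list = 'my_topic',
    kafka_group_name = 'clickhouse_consumer_group',
    kafka_format = 'JSONAsString';
```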
- What are the requirements for using ClickPipes for Kafka?

- In order to use ClickPipes for Kafka, you will need a running Kafka broker and a ClickHouse Cloud service with ClickPipes enabled. You will also need to ensure that ClickHouse Cloud can access your Kafka broker. This can be achieved by allowing remote connections on the Kafka side and whitelisting the [ClickHouse Cloud Egress IP addresses](/manage/security/cloud-endpoints-api) in your Kafka setup. Alternatively, you can use [AWS PrivateLink](/integrations/clickpipes/aws-privatelink) to connect ClickPipes for Kafka to your Kafka brokers.
- Does ClickPipes for Kafka support AWS PrivateLink? - AWS PrivateLink is supported. See [the documentation](/integrations/clickpipes/aws-privatelink) for more information on how to set it up.
- Can I use ClickPipes for Kafka to write data to a Kafka topic?

- No. ClickPipes for Kafka is designed for reading data from Kafka topics, not writing data to them. To write data to a Kafka topic, you will need to use a dedicated Kafka producer.
- Does ClickPipes support multiple brokers?

- Yes. If the brokers are part of the same quorum, they can be configured together, delimited with `,`, as in the example below.
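For example, a broker list might look like this (hostnames and ports are placeholders):

```text
kafka-broker-1.example.com:9092,kafka-broker-2.example.com:9092,kafka-broker-3.example.com:9092
```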
### Upstash {#upstash}
- Does ClickPipes support Upstash?

- Yes. The Upstash Kafka product entered a deprecation period on 11th September 2024 for 6 months. Existing customers can continue to use ClickPipes with their existing Upstash Kafka brokers using the generic Kafka tile on the ClickPipes user interface. Existing Upstash Kafka ClickPipes created before the deprecation notice are unaffected. When the deprecation period is up, the ClickPipe will stop functioning.
- Does ClickPipes support the Upstash schema registry?

- No. ClickPipes is not compatible with the Upstash Kafka schema registry.
- Does ClickPipes support the Upstash QStash Workflow?

- No. Unless a Kafka-compatible surface is introduced in QStash Workflow, it will not work with Kafka ClickPipes.
### Azure Event Hubs {#azure-eventhubs}
- Does the Azure Event Hubs ClickPipe work without the Kafka surface?

- No. ClickPipes requires Azure Event Hubs to have the Kafka surface enabled. The Kafka protocol is supported only on the Standard, Premium, and Dedicated SKU pricing tiers.
- Does the Azure Schema Registry work with ClickPipes?

- No. ClickPipes is not currently compatible with the Event Hubs Schema Registry.
- What permissions does my policy need to consume from Azure Event Hubs?

- To list topics and consume events, the shared access policy given to ClickPipes requires at least a 'Listen' claim.
- Why is my Event Hubs instance not returning any data?

- If your ClickHouse instance is in a different region or continent from your Event Hubs deployment, you may experience timeouts when onboarding your ClickPipes and higher latency when consuming data from the Event Hub. It is considered a best practice to locate your ClickHouse Cloud deployment and Azure Event Hubs deployment in cloud regions close to each other to avoid adverse performance.
- Should I include the port number for Azure Event Hubs? - Yes. ClickPipes expects you to include your port number for the Kafka surface, which should be `:9093`.
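For example, a broker address for the Event Hubs Kafka surface would typically take this form (the namespace is a placeholder):

```text
my-namespace.servicebus.windows.net:9093
```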
- Are the ClickPipes IPs still relevant for Azure Event Hubs?

- Yes. If you restrict traffic to your Event Hubs instance, please add the [documented static NAT IPs](../index.md#list-of-static-ips).
-
Is the connection string for the Event Hub, or is it for the Event Hub namespace?

- Both will work; however, we recommend using a shared access policy at the namespace level to retrieve samples from multiple Event Hubs.
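For reference, a namespace-level connection string generally has the following shape (all values are placeholders); adding an `EntityPath` would scope it to a single Event Hub:

```text
Endpoint=sb://my-namespace.servicebus.windows.net/;SharedAccessKeyName=my-listen-policy;SharedAccessKey=<key>
```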
diff --git a/docs/integrations/data-ingestion/clickpipes/kafka/index.md b/docs/integrations/data-ingestion/clickpipes/kafka/index.md index bd9ed9e649a..591a319cc5b 100644 --- a/docs/integrations/data-ingestion/clickpipes/kafka/index.md +++ b/docs/integrations/data-ingestion/clickpipes/kafka/index.md @@ -13,4 +13,4 @@ title: 'Kafka ClickPipes' | [Creating your first Kafka ClickPipe](/integrations/clickpipes/kafka/create-your-first-kafka-clickpipe) | Step-by-step guide to creating your first Kafka ClickPipe. | | [Kafka ClickPipes FAQ](/integrations/clickpipes/kafka/faq) | Details best practices to follow when working with Kafka ClickPipes | | [Best practices](/integrations/clickpipes/kafka/best-practices) | Details best practices to follow when working with Kafka ClickPipes | - \ No newline at end of file + diff --git a/docs/integrations/data-ingestion/clickpipes/kinesis.md b/docs/integrations/data-ingestion/clickpipes/kinesis.md index 396e29208d8..23de3ecebf4 100644 --- a/docs/integrations/data-ingestion/clickpipes/kinesis.md +++ b/docs/integrations/data-ingestion/clickpipes/kinesis.md @@ -20,7 +20,6 @@ import cp_destination from '@site/static/images/integrations/data-ingestion/clic import cp_overview from '@site/static/images/integrations/data-ingestion/clickpipes/cp_overview.png'; import Image from '@theme/IdealImage'; - # Integrating Amazon Kinesis with ClickHouse Cloud ## Prerequisite {#prerequisite} You have familiarized yourself with the [ClickPipes intro](./index.md) and setup [IAM credentials](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html) or an [IAM Role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html). Follow the [Kinesis Role-Based Access guide](./secure-kinesis.md) for information on how to setup a role that works with ClickHouse Cloud. @@ -29,61 +28,60 @@ You have familiarized yourself with the [ClickPipes intro](./index.md) and setup 1. Access the SQL Console for your ClickHouse Cloud Service. - + 2. Select the `Data Sources` button on the left-side menu and click on "Set up a ClickPipe" - + 3. Select your data source. - + 4. Fill out the form by providing your ClickPipe with a name, a description (optional), your IAM role or credentials, and other connection details. - + 5. Select Kinesis Stream and starting offset. The UI will display a sample document from the selected source (Kafka topic, etc). You can also enable Enhanced Fan-out for Kinesis streams to improve the performance and stability of your ClickPipe (More information on Enhanced Fan-out can be found [here](https://aws.amazon.com/blogs/aws/kds-enhanced-fanout)) - + 6. In the next step, you can select whether you want to ingest data into a new ClickHouse table or reuse an existing one. Follow the instructions in the screen to modify your table name, schema, and settings. You can see a real-time preview of your changes in the sample table at the top. - + - You can also customize the advanced settings using the controls provided + You can also customize the advanced settings using the controls provided - + 7. Alternatively, you can decide to ingest your data in an existing ClickHouse table. In that case, the UI will allow you to map fields from the source to the ClickHouse fields in the selected destination table. - + 8. Finally, you can configure permissions for the internal ClickPipes user. - **Permissions:** ClickPipes will create a dedicated user for writing data into a destination table. 
You can select a role for this internal user using a custom role or one of the predefined role: + **Permissions:** ClickPipes will create a dedicated user for writing data into a destination table. You can select a role for this internal user using a custom role or one of the predefined role: - `Full access`: with the full access to the cluster. It might be useful if you use materialized view or Dictionary with the destination table. - `Only destination table`: with the `INSERT` permissions to the destination table only. - + 9. By clicking on "Complete Setup", the system will register you ClickPipe, and you'll be able to see it listed in the summary table. - + - + - The summary table provides controls to display sample data from the source or the destination table in ClickHouse + The summary table provides controls to display sample data from the source or the destination table in ClickHouse - + - As well as controls to remove the ClickPipe and display a summary of the ingest job. + As well as controls to remove the ClickPipe and display a summary of the ingest job. - + 10. **Congratulations!** you have successfully set up your first ClickPipe. If this is a streaming ClickPipe it will be continuously running, ingesting data in real-time from your remote data source. Otherwise it will ingest the batch and complete. - ## Supported data formats {#supported-data-formats} The supported formats are: @@ -123,7 +121,7 @@ JSON type support is automatic if your Cloud service is running ClickHouse 25.3 have to submit a support ticket to enable it on your service. JSON fields that are always a JSON object can be assigned to a JSON destination column. You will have to manually change the destination -column to the desired JSON type, including any fixed or skipped paths. +column to the desired JSON type, including any fixed or skipped paths. ## Kinesis virtual columns {#kinesis-virtual-columns} diff --git a/docs/integrations/data-ingestion/clickpipes/mysql/index.md b/docs/integrations/data-ingestion/clickpipes/mysql/index.md index 1b0a2a00a9f..ebd17984b99 100644 --- a/docs/integrations/data-ingestion/clickpipes/mysql/index.md +++ b/docs/integrations/data-ingestion/clickpipes/mysql/index.md @@ -23,7 +23,6 @@ import Image from '@theme/IdealImage'; Currently, ingesting data from MySQL to ClickHouse Cloud via ClickPipes is in Private Preview. ::: - You can use ClickPipes to ingest data from your source MySQL database into ClickHouse Cloud. The source MySQL database can be hosted on-premises or in the cloud. ## Prerequisites {#prerequisites} @@ -42,7 +41,7 @@ To get started, you first need to make sure that your MySQL database is set up c 6. [Generic MariaDB](./mysql/source/generic_maria) -Once your source MySQL database is set up, you can continue creating your ClickPipe. + Once your source MySQL database is set up, you can continue creating your ClickPipe. ## Create your ClickPipe {#creating-your-clickpipe} @@ -51,49 +50,48 @@ Make sure you are logged in to your ClickHouse Cloud account. If you don't have [//]: # ( TODO update image here) 1. In the ClickHouse Cloud console, navigate to your ClickHouse Cloud Service. - + 2. Select the `Data Sources` button on the left-side menu and click on "Set up a ClickPipe" - + 3. Select the `MySQL CDC` tile - + ### Add your source MySQL database connection {#adding-your-source-mysql-database-connection} 4. Fill in the connection details for your source MySQL database which you configured in the prerequisites step. 
- :::info + :::info - Before you start adding your connection details make sure that you have whitelisted ClickPipes IP addresses in your firewall rules. On the following page you can find a [list of ClickPipes IP addresses](../index.md#list-of-static-ips). - For more information refer to the source MySQL setup guides linked at [the top of this page](#prerequisites). + Before you start adding your connection details make sure that you have whitelisted ClickPipes IP addresses in your firewall rules. On the following page you can find a [list of ClickPipes IP addresses](../index.md#list-of-static-ips). + For more information refer to the source MySQL setup guides linked at [the top of this page](#prerequisites). - ::: + ::: - + #### (Optional) Set up SSH tunneling {#optional-setting-up-ssh-tunneling} You can specify SSH tunneling details if your source MySQL database is not publicly accessible. - 1. Enable the "Use SSH Tunnelling" toggle. 2. Fill in the SSH connection details. - + 3. To use Key-based authentication, click on "Revoke and generate key pair" to generate a new key pair and copy the generated public key to your SSH server under `~/.ssh/authorized_keys`. 4. Click on "Verify Connection" to verify the connection. -:::note + :::note -Make sure to whitelist [ClickPipes IP addresses](../clickpipes#list-of-static-ips) in your firewall rules for the SSH bastion host so that ClickPipes can establish the SSH tunnel. + Make sure to whitelist [ClickPipes IP addresses](../clickpipes#list-of-static-ips) in your firewall rules for the SSH bastion host so that ClickPipes can establish the SSH tunnel. -::: + ::: -Once the connection details are filled in, click on "Next". + Once the connection details are filled in, click on "Next". #### Configure advanced settings {#advanced-settings} @@ -105,12 +103,11 @@ You can configure the advanced settings if needed. A brief description of each s - **Snapshot number of rows per partition**: This is the number of rows that will be fetched in each partition during the initial snapshot. This is useful when you have a large number of rows in your tables and you want to control the number of rows fetched in each partition. - **Snapshot number of tables in parallel**: This is the number of tables that will be fetched in parallel during the initial snapshot. This is useful when you have a large number of tables and you want to control the number of tables fetched in parallel. - ### Configure the tables {#configuring-the-tables} 5. Here you can select the destination database for your ClickPipe. You can either select an existing database or create a new one. - + 6. You can select the tables you want to replicate from the source MySQL database. While selecting the tables, you can also choose to rename the tables in the destination ClickHouse database as well as exclude specific columns. @@ -118,6 +115,6 @@ You can configure the advanced settings if needed. A brief description of each s 7. Select the "Full access" role from the permissions dropdown and click "Complete Setup". - + -Finally, please refer to the ["ClickPipes for MySQL FAQ"](/integrations/clickpipes/mysql/faq) page for more information about common issues and how to resolve them. + Finally, please refer to the ["ClickPipes for MySQL FAQ"](/integrations/clickpipes/mysql/faq) page for more information about common issues and how to resolve them. 
diff --git a/docs/integrations/data-ingestion/clickpipes/mysql/source/aurora.md b/docs/integrations/data-ingestion/clickpipes/mysql/source/aurora.md index 395d7e18feb..5872e516122 100644 --- a/docs/integrations/data-ingestion/clickpipes/mysql/source/aurora.md +++ b/docs/integrations/data-ingestion/clickpipes/mysql/source/aurora.md @@ -57,21 +57,20 @@ The following settings need to be set as follows: 1. `binlog_format` to `ROW`. - + 2. `binlog_row_metadata` to `FULL` - + 3. `binlog_row_image` to `FULL` - + -Then click on `Save Changes` in the top-right. You may need to reboot your instance for the changes to take effect - a way of knowing this is if you see `Pending reboot` next to the parameter group link in the Configurations tab of the RDS instance. -
-:::tip -If you have a MySQL cluster, the above parameters would be found in a [DB Cluster](https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/USER_WorkingWithParamGroups.CreatingCluster.html) parameter group and not the DB instance group. -::: + Then click on `Save Changes` in the top-right. You may need to reboot your instance for the changes to take effect - a way of knowing this is if you see `Pending reboot` next to the parameter group link in the Configurations tab of the RDS instance. + :::tip + If you have a MySQL cluster, the above parameters would be found in a [DB Cluster](https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/USER_WorkingWithParamGroups.CreatingCluster.html) parameter group and not the DB instance group. + ::: ## Enabling GTID mode {#gtid-mode-aurora} Global Transaction Identifiers (GTIDs) are unique IDs assigned to each committed transaction in MySQL. They simplify binlog replication and make troubleshooting more straightforward. @@ -88,12 +87,11 @@ To enable GTID mode for your MySQL instance, follow the steps as follows: 7. Click on `Save Changes` in the top-right corner. 8. Reboot your instance for the changes to take effect. - + -
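After the reboot, a quick sanity check (a hedged sketch; variable availability can differ slightly by provider) is to confirm the setting from a SQL client:

```sql
-- Expect gtid_mode = ON once the change has taken effect
SHOW VARIABLES LIKE 'gtid_mode';
```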
-:::info -The MySQL ClickPipe also supports replication without GTID mode. However, enabling GTID mode is recommended for better performance and easier troubleshooting. -::: + :::info + The MySQL ClickPipe also supports replication without GTID mode. However, enabling GTID mode is recommended for better performance and easier troubleshooting. + ::: ## Configure a database user {#configure-database-user-aurora} diff --git a/docs/integrations/data-ingestion/clickpipes/mysql/source/gcp.md b/docs/integrations/data-ingestion/clickpipes/mysql/source/gcp.md index 3a87882f74c..2ded7aff40e 100644 --- a/docs/integrations/data-ingestion/clickpipes/mysql/source/gcp.md +++ b/docs/integrations/data-ingestion/clickpipes/mysql/source/gcp.md @@ -32,12 +32,12 @@ If not already configured, make sure to set these in the database flags section 2. `binlog_row_metadata` to `FULL` 3. `binlog_row_image` to `FULL` -To do this, click on the `Edit` button in the top right corner of the instance overview page. - + To do this, click on the `Edit` button in the top right corner of the instance overview page. + -Then scroll down to the `Flags` section and add the above flags. + Then scroll down to the `Flags` section and add the above flags. - + ## Configure a database user {#configure-database-user-gcp} @@ -77,8 +77,8 @@ To connect to your Cloud SQL instance, you need to download the root CA certific 3. Click on the `Security` tab. 4. In the `Manage server CA certificates` section, click on the `DOWNLOAD CERTIFICATES` button at the bottom. - + 5. In the ClickPipes UI, upload the downloaded certificate when creating a new MySQL ClickPipe. - + diff --git a/docs/integrations/data-ingestion/clickpipes/mysql/source/generic.md b/docs/integrations/data-ingestion/clickpipes/mysql/source/generic.md index 438cda38455..d70a2c12f9f 100644 --- a/docs/integrations/data-ingestion/clickpipes/mysql/source/generic.md +++ b/docs/integrations/data-ingestion/clickpipes/mysql/source/generic.md @@ -113,11 +113,11 @@ Connect to your MySQL instance as the root user and execute the following comman GRANT REPLICATION SLAVE ON *.* TO 'clickpipes_user'@'%'; ``` -:::note + :::note -Make sure to replace `clickpipes_user` and `some_secure_password` with your desired username and password. + Make sure to replace `clickpipes_user` and `some_secure_password` with your desired username and password. -::: + ::: ## SSL/TLS configuration (recommended) {#ssl-tls-configuration} @@ -136,4 +136,4 @@ For more information on SSL/TLS options, check out our [FAQ](https://clickhouse. ## What's next? {#whats-next} You can now [create your ClickPipe](../index.md) and start ingesting data from your MySQL instance into ClickHouse Cloud. -Make sure to note down the connection details you used while setting up your MySQL instance as you will need them during the ClickPipe creation process. \ No newline at end of file +Make sure to note down the connection details you used while setting up your MySQL instance as you will need them during the ClickPipe creation process. 
diff --git a/docs/integrations/data-ingestion/clickpipes/mysql/source/generic_maria.md b/docs/integrations/data-ingestion/clickpipes/mysql/source/generic_maria.md index 7099425aae4..9cf5cc0766f 100644 --- a/docs/integrations/data-ingestion/clickpipes/mysql/source/generic_maria.md +++ b/docs/integrations/data-ingestion/clickpipes/mysql/source/generic_maria.md @@ -82,11 +82,11 @@ Connect to your MariaDB instance as the root user and execute the following comm GRANT REPLICATION SLAVE ON *.* TO 'clickpipes_user'@'%'; ``` -:::note + :::note -Make sure to replace `clickpipes_user` and `some_secure_password` with your desired username and password. + Make sure to replace `clickpipes_user` and `some_secure_password` with your desired username and password. -::: + ::: ## SSL/TLS configuration (recommended) {#ssl-tls-configuration} @@ -105,4 +105,4 @@ For more information on SSL/TLS options, check out our [FAQ](https://clickhouse. ## What's next? {#whats-next} You can now [create your ClickPipe](../index.md) and start ingesting data from your MariaDB instance into ClickHouse Cloud. -Make sure to note down the connection details you used while setting up your MariaDB instance as you will need them during the ClickPipe creation process. \ No newline at end of file +Make sure to note down the connection details you used while setting up your MariaDB instance as you will need them during the ClickPipe creation process. diff --git a/docs/integrations/data-ingestion/clickpipes/mysql/source/rds.md b/docs/integrations/data-ingestion/clickpipes/mysql/source/rds.md index e3267954868..34eda60b729 100644 --- a/docs/integrations/data-ingestion/clickpipes/mysql/source/rds.md +++ b/docs/integrations/data-ingestion/clickpipes/mysql/source/rds.md @@ -58,22 +58,21 @@ The following settings need to be set as follows: 1. `binlog_format` to `ROW`. - + 2. `binlog_row_metadata` to `FULL` - + 3. `binlog_row_image` to `FULL` - + -Then click on `Save Changes` in the top-right. You may need to reboot your instance for the changes to take effect - a way of knowing this is if you see `Pending reboot` next to the parameter group link in the Configurations tab of the RDS instance. + Then click on `Save Changes` in the top-right. You may need to reboot your instance for the changes to take effect - a way of knowing this is if you see `Pending reboot` next to the parameter group link in the Configurations tab of the RDS instance. -
-:::tip -If you have a MySQL cluster, the above parameters would be found in a [DB Cluster](https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/USER_WorkingWithParamGroups.CreatingCluster.html) parameter group and not the DB instance group. -::: + :::tip + If you have a MySQL cluster, the above parameters would be found in a [DB Cluster](https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/USER_WorkingWithParamGroups.CreatingCluster.html) parameter group and not the DB instance group. + ::: ## Enabling GTID Mode {#gtid-mode-rds} Global Transaction Identifiers (GTIDs) are unique IDs assigned to each committed transaction in MySQL. They simplify binlog replication and make troubleshooting more straightforward. @@ -90,13 +89,11 @@ To enable GTID mode for your MySQL instance, follow the steps as follows: 7. Click on `Save Changes` in the top-right corner. 8. Reboot your instance for the changes to take effect. - - -
-:::tip -The MySQL ClickPipe also supports replication without GTID mode. However, enabling GTID mode is recommended for better performance and easier troubleshooting. -::: + + :::tip + The MySQL ClickPipe also supports replication without GTID mode. However, enabling GTID mode is recommended for better performance and easier troubleshooting. + ::: ## Configure a database user {#configure-database-user-rds} diff --git a/docs/integrations/data-ingestion/clickpipes/mysql/source/rds_maria.md b/docs/integrations/data-ingestion/clickpipes/mysql/source/rds_maria.md index 4a5a233fb66..0ac69942971 100644 --- a/docs/integrations/data-ingestion/clickpipes/mysql/source/rds_maria.md +++ b/docs/integrations/data-ingestion/clickpipes/mysql/source/rds_maria.md @@ -57,22 +57,21 @@ Settings `binlog_format`, `binlog_row_metadata` and `binlog_row_image` need to b 1. `binlog_format` to `ROW`. - + 2. `binlog_row_metadata` to `FULL` - + 3. `binlog_row_image` to `FULL` - + -Next, click on `Save Changes` in the top-right. You may need to reboot your instance for the changes to take effect. If you see `Pending reboot` next to the parameter group link in the Configurations tab of the RDS instance, this is a good indication that a reboot of your instance is needed. + Next, click on `Save Changes` in the top-right. You may need to reboot your instance for the changes to take effect. If you see `Pending reboot` next to the parameter group link in the Configurations tab of the RDS instance, this is a good indication that a reboot of your instance is needed. -
-:::tip -If you have a MariaDB cluster, the above parameters would be found in a [DB Cluster](https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/USER_WorkingWithParamGroups.CreatingCluster.html) parameter group and not the DB instance group. -::: + :::tip + If you have a MariaDB cluster, the above parameters would be found in a [DB Cluster](https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/USER_WorkingWithParamGroups.CreatingCluster.html) parameter group and not the DB instance group. + ::: ## Enabling GTID Mode {#gtid-mode-rds} Global Transaction Identifiers (GTIDs) are unique IDs assigned to each committed transaction in MySQL/MariaDB. They simplify binlog replication and make troubleshooting more straightforward. MariaDB enables GTID mode by default, so no user action is needed to use it. @@ -112,4 +111,3 @@ If you want to restrict traffic to your RDS instance, please add the [documented ### Private access via AWS PrivateLink {#private-access-via-aws-privatelink} To connect to your RDS instance through a private network, you can use AWS PrivateLink. Follow our [AWS PrivateLink setup guide for ClickPipes](/knowledgebase/aws-privatelink-setup-for-clickpipes) to set up the connection. - diff --git a/docs/integrations/data-ingestion/clickpipes/object-storage.md b/docs/integrations/data-ingestion/clickpipes/object-storage.md index 22d0a10102c..17396aeecca 100644 --- a/docs/integrations/data-ingestion/clickpipes/object-storage.md +++ b/docs/integrations/data-ingestion/clickpipes/object-storage.md @@ -26,7 +26,6 @@ import Image from '@theme/IdealImage'; # Integrating object storage with ClickHouse Cloud Object Storage ClickPipes provide a simple and resilient way to ingest data from Amazon S3, Google Cloud Storage, Azure Blob Storage, and DigitalOcean Spaces into ClickHouse Cloud. Both one-time and continuous ingestion are supported with exactly-once semantics. - ## Prerequisite {#prerequisite} You have familiarized yourself with the [ClickPipes intro](./index.md). @@ -34,59 +33,59 @@ You have familiarized yourself with the [ClickPipes intro](./index.md). 1. In the cloud console, select the `Data Sources` button on the left-side menu and click on "Set up a ClickPipe" - + 2. Select your data source. - + 3. Fill out the form by providing your ClickPipe with a name, a description (optional), your IAM role or credentials, and bucket URL. You can specify multiple files using bash-like wildcards. For more information, [see the documentation on using wildcards in path](#limitations). - + 4. The UI will display a list of files in the specified bucket. Select your data format (we currently support a subset of ClickHouse formats) and if you want to enable continuous ingestion [More details below](#continuous-ingest). - + 5. In the next step, you can select whether you want to ingest data into a new ClickHouse table or reuse an existing one. Follow the instructions in the screen to modify your table name, schema, and settings. You can see a real-time preview of your changes in the sample table at the top. - + - You can also customize the advanced settings using the controls provided + You can also customize the advanced settings using the controls provided - + 6. Alternatively, you can decide to ingest your data in an existing ClickHouse table. In that case, the UI will allow you to map fields from the source to the ClickHouse fields in the selected destination table. 
- + -:::info -You can also map [virtual columns](../../sql-reference/table-functions/s3#virtual-columns), like `_path` or `_size`, to fields. -::: + :::info + You can also map [virtual columns](../../sql-reference/table-functions/s3#virtual-columns), like `_path` or `_size`, to fields. + ::: 7. Finally, you can configure permissions for the internal ClickPipes user. - **Permissions:** ClickPipes will create a dedicated user for writing data into a destination table. You can select a role for this internal user using a custom role or one of the predefined role: + **Permissions:** ClickPipes will create a dedicated user for writing data into a destination table. You can select a role for this internal user using a custom role or one of the predefined role: - `Full access`: with the full access to the cluster. Required if you use materialized view or Dictionary with the destination table. - `Only destination table`: with the `INSERT` permissions to the destination table only. - + 8. By clicking on "Complete Setup", the system will register you ClickPipe, and you'll be able to see it listed in the summary table. - + - + - The summary table provides controls to display sample data from the source or the destination table in ClickHouse + The summary table provides controls to display sample data from the source or the destination table in ClickHouse - + - As well as controls to remove the ClickPipe and display a summary of the ingest job. + As well as controls to remove the ClickPipe and display a summary of the ingest job. - + -Image + Image 9. **Congratulations!** you have successfully set up your first ClickPipe. If this is a streaming ClickPipe it will be continuously running, ingesting data in real-time from your remote data source. Otherwise it will ingest the batch and complete. ## Supported data sources {#supported-data-sources} @@ -130,20 +129,19 @@ To increase the throughput on large ingest jobs, we recommend scaling the ClickH - ClickPipes will only attempt to ingest objects at 10GB or smaller in size. If a file is greater than 10GB an error will be appended to the ClickPipes dedicated error table. - Azure Blob Storage pipes with continuous ingest on containers with over 100k files will have a latency of around 10–15 seconds in detecting new files. Latency increases with file count. - S3 / GCS ClickPipes **does not** share a listing syntax with the [S3 Table Function](/sql-reference/table-functions/s3), nor Azure with the [AzureBlobStorage Table function](/sql-reference/table-functions/azureBlobStorage). - - `?` — Substitutes any single character - - `*` — Substitutes any number of any characters except / including empty string - - `**` — Substitutes any number of any character include / including empty string + - `?` — Substitutes any single character + - `*` — Substitutes any number of any characters except / including empty string + - `**` — Substitutes any number of any character include / including empty string -:::note -This is a valid path (for S3): + :::note + This is a valid path (for S3): -https://datasets-documentation.s3.eu-west-3.amazonaws.com/http/**.ndjson.gz + https://datasets-documentation.s3.eu-west-3.amazonaws.com/http/**.ndjson.gz + This is not a valid path. `{N..M}` are not supported in ClickPipes. -This is not a valid path. `{N..M}` are not supported in ClickPipes. 
- -https://datasets-documentation.s3.eu-west-3.amazonaws.com/http/{documents-01,documents-02}.ndjson.gz -::: + https://datasets-documentation.s3.eu-west-3.amazonaws.com/http/{documents-01,documents-02}.ndjson.gz + ::: ## Continuous Ingest {#continuous-ingest} ClickPipes supports continuous ingestion from S3, GCS, Azure Blob Storage, and DigitalOcean Spaces. When enabled, ClickPipes continuously ingests data from the specified path, and polls for new files at a rate of once every 30 seconds. However, new files must be lexically greater than the last ingested file. This means that they must be named in a way that defines the ingestion order. For instance, files named `file1`, `file2`, `file3`, etc., will be ingested sequentially. If a new file is added with a name like `file0`, ClickPipes will not ingest it because it is not lexically greater than the last ingested file. @@ -177,8 +175,8 @@ Currently only protected buckets are supported for Azure Blob Storage. Authentic - **Does ClickPipes support GCS buckets prefixed with `gs://`?** -No. For interoperability reasons we ask you to replace your `gs://` bucket prefix with `https://storage.googleapis.com/`. + No. For interoperability reasons we ask you to replace your `gs://` bucket prefix with `https://storage.googleapis.com/`. - **What permissions does a GCS public bucket require?** -`allUsers` requires appropriate role assignment. The `roles/storage.objectViewer` role must be granted at the bucket level. This role provides the `storage.objects.list` permission, which allows ClickPipes to list all objects in the bucket which is required for onboarding and ingestion. This role also includes the `storage.objects.get` permission, which is required to read or download individual objects in the bucket. See: [Google Cloud Access Control](https://cloud.google.com/storage/docs/access-control/iam-roles) for further information. + `allUsers` requires appropriate role assignment. The `roles/storage.objectViewer` role must be granted at the bucket level. This role provides the `storage.objects.list` permission, which allows ClickPipes to list all objects in the bucket which is required for onboarding and ingestion. This role also includes the `storage.objects.get` permission, which is required to read or download individual objects in the bucket. See: [Google Cloud Access Control](https://cloud.google.com/storage/docs/access-control/iam-roles) for further information. diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/add_table.md b/docs/integrations/data-ingestion/clickpipes/postgres/add_table.md index 9de4ca503ce..6b5c6a0b419 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/add_table.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/add_table.md @@ -20,8 +20,7 @@ This can be done by the following steps: 2. Click on Edit Table settings. 3. Locate your table - this can be done by searching it in the search bar. 4. Select the table by clicking on the checkbox. -
- + 5. Click update. 6. Upon successful update, the pipe will have statuses `Setup`, `Snapshot` and `Running` in that order. The table's initial load can be tracked in the **Tables** tab. diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/deduplication.md b/docs/integrations/data-ingestion/clickpipes/postgres/deduplication.md index a46c74122fa..dc488f2e7d3 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/deduplication.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/deduplication.md @@ -74,51 +74,51 @@ _Take note of the WHERE clause in the following queries, used to filter out dele - **Simple count query**: Count the number of posts. -This is the simplest query you can run to check if the synchronization went fine. The two queries should return the same count. + This is the simplest query you can run to check if the synchronization went fine. The two queries should return the same count. -```sql --- PostgreSQL -SELECT count(*) FROM posts; + ```sql + -- PostgreSQL + SELECT count(*) FROM posts; --- ClickHouse -SELECT count(*) FROM posts FINAL WHERE _peerdb_is_deleted=0; -``` + -- ClickHouse + SELECT count(*) FROM posts FINAL WHERE _peerdb_is_deleted=0; + ``` -- **Simple aggregation with JOIN**: Top 10 users who have accumulated the most views. +- **Simple aggregation with JOIN**: Top 10 users who have accumulated the most views. -An example of an aggregation on a single table. Having duplicates here would greatly affect the result of the sum function. + An example of an aggregation on a single table. Having duplicates here would greatly affect the result of the sum function. -```sql --- PostgreSQL -SELECT + ```sql + -- PostgreSQL + SELECT sum(p.viewcount) AS viewcount, p.owneruserid AS user_id, u.displayname AS display_name -FROM posts p -LEFT JOIN users u ON u.id = p.owneruserid --- highlight-next-line -WHERE p.owneruserid > 0 -GROUP BY user_id, display_name -ORDER BY viewcount DESC -LIMIT 10; - --- ClickHouse -SELECT + FROM posts p + LEFT JOIN users u ON u.id = p.owneruserid + -- highlight-next-line + WHERE p.owneruserid > 0 + GROUP BY user_id, display_name + ORDER BY viewcount DESC + LIMIT 10; + + -- ClickHouse + SELECT sum(p.viewcount) AS viewcount, p.owneruserid AS user_id, u.displayname AS display_name -FROM posts AS p -FINAL -LEFT JOIN users AS u -FINAL ON (u.id = p.owneruserid) AND (u._peerdb_is_deleted = 0) --- highlight-next-line -WHERE (p.owneruserid > 0) AND (p._peerdb_is_deleted = 0) -GROUP BY + FROM posts AS p + FINAL + LEFT JOIN users AS u + FINAL ON (u.id = p.owneruserid) AND (u._peerdb_is_deleted = 0) + -- highlight-next-line + WHERE (p.owneruserid > 0) AND (p._peerdb_is_deleted = 0) + GROUP BY user_id, display_name -ORDER BY viewcount DESC -LIMIT 10 -``` + ORDER BY viewcount DESC + LIMIT 10 + ``` #### FINAL setting {#final-setting} @@ -132,7 +132,7 @@ SELECT count(*) FROM posts SETTINGS FINAL = 1; -- Set FINAL for the session SET final = 1; -SELECT count(*) FROM posts; +SELECT count(*) FROM posts; ``` #### ROW policy {#row-policy} @@ -148,7 +148,7 @@ CREATE ROW POLICY cdc_policy ON votes FOR SELECT USING _peerdb_is_deleted = 0 TO ### Query like with Postgres {#query-like-with-postgres} -Migrating an analytical dataset from PostgreSQL to ClickHouse often requires modifying application queries to account for differences in data handling and query execution. +Migrating an analytical dataset from PostgreSQL to ClickHouse often requires modifying application queries to account for differences in data handling and query execution. 
This section will explore techniques for deduplicating data while keeping the original queries unchanged. @@ -165,7 +165,7 @@ CREATE VIEW votes_view AS SELECT * FROM votes FINAL WHERE _peerdb_is_deleted=0; CREATE VIEW comments_view AS SELECT * FROM comments FINAL WHERE _peerdb_is_deleted=0; ``` -Then, we can query the views using the same query we would use in PostgreSQL. +Then, we can query the views using the same query we would use in PostgreSQL. ```sql -- Most viewed posts @@ -188,12 +188,12 @@ The key advantage of this method is that the query using the FINAL keyword runs However, a drawback is that the data in the destination table is only as up-to-date as the most recent refresh. That said, for many use cases, refresh intervals ranging from several minutes to a few hours may be sufficient. ```sql --- Create deduplicated posts table +-- Create deduplicated posts table CREATE TABLE deduplicated_posts AS posts; -- Create the Materialized view and schedule to run every hour -CREATE MATERIALIZED VIEW deduplicated_posts_mv REFRESH EVERY 1 HOUR TO deduplicated_posts AS -SELECT * FROM posts FINAL WHERE _peerdb_is_deleted=0 +CREATE MATERIALIZED VIEW deduplicated_posts_mv REFRESH EVERY 1 HOUR TO deduplicated_posts AS +SELECT * FROM posts FINAL WHERE _peerdb_is_deleted=0 ``` Then, you can query the table `deduplicated_posts` normally. diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/faq.md b/docs/integrations/data-ingestion/clickpipes/postgres/faq.md index 279b5410929..4ed8611d31b 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/faq.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/faq.md @@ -35,17 +35,17 @@ Yes, partitioned tables are supported out of the box, as long as they have a PRI Yes! ClickPipes for Postgres offers two ways to connect to databases in private networks: 1. **SSH Tunneling** - - Works well for most use cases - - See the setup instructions [here](/integrations/clickpipes/postgres#adding-your-source-postgres-database-connection) - - Works across all regions + - Works well for most use cases + - See the setup instructions [here](/integrations/clickpipes/postgres#adding-your-source-postgres-database-connection) + - Works across all regions 2. **AWS PrivateLink** - - Available in three AWS regions: + - Available in three AWS regions: - us-east-1 - - us-east-2 + - us-east-2 - eu-central-1 - - For detailed setup instructions, see our [PrivateLink documentation](/knowledgebase/aws-privatelink-setup-for-clickpipes) - - For regions where PrivateLink is not available, please use SSH tunneling + - For detailed setup instructions, see our [PrivateLink documentation](/knowledgebase/aws-privatelink-setup-for-clickpipes) + - For regions where PrivateLink is not available, please use SSH tunneling ### How do you handle UPDATEs and DELETEs? {#how-do-you-handle-updates-and-deletes} @@ -70,41 +70,41 @@ For detailed pricing information, please refer to the [ClickPipes for Postgres C If you're noticing that the size of your Postgres replication slot keeps increasing or isn't coming back down, it usually means that **WAL (Write-Ahead Log) records aren't being consumed (or "replayed") quickly enough** by your CDC pipeline or replication process. Below are the most common causes and how you can address them. -1. **Sudden Spikes in Database Activity** - - Large batch updates, bulk inserts, or significant schema changes can quickly generate a lot of WAL data. 
- - The replication slot will hold these WAL records until they are consumed, causing a temporary spike in size. +1. **Sudden Spikes in Database Activity** + - Large batch updates, bulk inserts, or significant schema changes can quickly generate a lot of WAL data. + - The replication slot will hold these WAL records until they are consumed, causing a temporary spike in size. -2. **Long-Running Transactions** - - An open transaction forces Postgres to keep all WAL segments generated since the transaction began, which can dramatically increase slot size. - - Set `statement_timeout` and `idle_in_transaction_session_timeout` to reasonable values to prevent transactions from staying open indefinitely: +2. **Long-Running Transactions** + - An open transaction forces Postgres to keep all WAL segments generated since the transaction began, which can dramatically increase slot size. + - Set `statement_timeout` and `idle_in_transaction_session_timeout` to reasonable values to prevent transactions from staying open indefinitely: ```sql - SELECT + SELECT pid, state, age(now(), xact_start) AS transaction_duration, query AS current_query - FROM + FROM pg_stat_activity - WHERE + WHERE xact_start IS NOT NULL - ORDER BY + ORDER BY age(now(), xact_start) DESC; ``` Use this query to identify unusually long-running transactions. -3. **Maintenance or Utility Operations (e.g., `pg_repack`)** - - Tools like `pg_repack` can rewrite entire tables, generating large amounts of WAL data in a short time. - - Schedule these operations during slower traffic periods or monitor your WAL usage closely while they run. +3. **Maintenance or Utility Operations (e.g., `pg_repack`)** + - Tools like `pg_repack` can rewrite entire tables, generating large amounts of WAL data in a short time. + - Schedule these operations during slower traffic periods or monitor your WAL usage closely while they run. -4. **VACUUM and VACUUM ANALYZE** - - Although necessary for database health, these operations can create extra WAL traffic—especially if they scan large tables. - - Consider using autovacuum tuning parameters or scheduling manual VACUUM operations during off-peak hours. +4. **VACUUM and VACUUM ANALYZE** + - Although necessary for database health, these operations can create extra WAL traffic—especially if they scan large tables. + - Consider using autovacuum tuning parameters or scheduling manual VACUUM operations during off-peak hours. -5. **Replication Consumer Not Actively Reading the Slot** - - If your CDC pipeline (e.g., ClickPipes) or another replication consumer stops, pauses, or crashes, WAL data will accumulate in the slot. - - Ensure your pipeline is continuously running and check logs for connectivity or authentication errors. +5. **Replication Consumer Not Actively Reading the Slot** + - If your CDC pipeline (e.g., ClickPipes) or another replication consumer stops, pauses, or crashes, WAL data will accumulate in the slot. + - Ensure your pipeline is continuously running and check logs for connectivity or authentication errors. -For an excellent deep dive into this topic, check out our blog post: [Overcoming Pitfalls of Postgres Logical Decoding](https://blog.peerdb.io/overcoming-pitfalls-of-postgres-logical-decoding#heading-beware-of-replication-slot-growth-how-to-monitor-it). 
+ For an excellent deep dive into this topic, check out our blog post: [Overcoming Pitfalls of Postgres Logical Decoding](https://blog.peerdb.io/overcoming-pitfalls-of-postgres-logical-decoding#heading-beware-of-replication-slot-growth-how-to-monitor-it). ### How are Postgres data types mapped to ClickHouse? {#how-are-postgres-data-types-mapped-to-clickhouse} @@ -122,12 +122,12 @@ JSON and JSONB columns are replicated as String type in ClickHouse. Since ClickH When you pause the mirror, the messages are queued up in the replication slot on the source Postgres, ensuring they are buffered and not lost. However, pausing and resuming the mirror will re-establish the connection, which could take some time depending on the source. -During this process, both the sync (pulling data from Postgres and streaming it into the ClickHouse raw table) and normalize (from raw table to target table) operations are aborted. However, they retain the state required to resume durably. +During this process, both the sync (pulling data from Postgres and streaming it into the ClickHouse raw table) and normalize (from raw table to target table) operations are aborted. However, they retain the state required to resume durably. - For sync, if it is canceled mid-way, the confirmed_flush_lsn in Postgres is not advanced, so the next sync will start from the same position as the aborted one, ensuring data consistency. - For normalize, the ReplacingMergeTree insert order handles deduplication. -In summary, while sync and normalize processes are terminated during a pause, it is safe to do so as they can resume without data loss or inconsistency. + In summary, while sync and normalize processes are terminated during a pause, it is safe to do so as they can resume without data loss or inconsistency. ### Can ClickPipe creation be automated or done via API or CLI? {#can-clickpipe-creation-be-automated-or-done-via-api-or-cli} @@ -143,7 +143,7 @@ For Postgres versions 13 or lower, CTID range scans are slower, and these settin 2. **Delete destination tables on ClickHouse**: Ensure that the tables created by the previous pipe are removed. 3. **Create a new pipe with optimized settings**: Typically, increase the snapshot number of rows per partition to between 1 million and 10 million, depending on your specific requirements and the load your Postgres instance can handle. -These adjustments should significantly enhance the performance of the initial load, especially for older Postgres versions. If you are using Postgres 14 or later, these settings are less impactful due to improved support for CTID range scans. + These adjustments should significantly enhance the performance of the initial load, especially for older Postgres versions. If you are using Postgres 14 or later, these settings are less impactful due to improved support for CTID range scans. ### How should I scope my publications when setting up replication? {#how-should-i-scope-my-publications-when-setting-up-replication} @@ -166,25 +166,24 @@ WHERE You have two options when dealing with tables without primary keys: 1. **Exclude tables without primary keys from ClickPipes**: - Create the publication with only the tables that have a primary key: - ```sql - CREATE PUBLICATION clickpipes_publication FOR TABLE table_with_primary_key1, table_with_primary_key2, ...; - ``` + Create the publication with only the tables that have a primary key: + ```sql + CREATE PUBLICATION clickpipes_publication FOR TABLE table_with_primary_key1, table_with_primary_key2, ...; + ``` 2. 
**Include tables without primary keys in ClickPipes**: - If you want to include tables without a primary key, you need to alter their replica identity to `FULL`. This ensures that UPDATE and DELETE operations work correctly: - ```sql - ALTER TABLE table_without_primary_key1 REPLICA IDENTITY FULL; - ALTER TABLE table_without_primary_key2 REPLICA IDENTITY FULL; - CREATE PUBLICATION clickpipes_publication FOR TABLE <...>, <...>; - ``` + If you want to include tables without a primary key, you need to alter their replica identity to `FULL`. This ensures that UPDATE and DELETE operations work correctly: + ```sql + ALTER TABLE table_without_primary_key1 REPLICA IDENTITY FULL; + ALTER TABLE table_without_primary_key2 REPLICA IDENTITY FULL; + CREATE PUBLICATION clickpipes_publication FOR TABLE <...>, <...>; + ``` -:::tip -If you're creating a publication manually instead of letting ClickPipes manage it, we don't recommend creating a publication `FOR ALL TABLES`, this leads to more traffic from Postgres to ClickPipes (to sending changes for other tables not in the pipe) and reduces overall efficiency. - -For manually created publications, please add any tables you want to the publication before adding them to the pipe. -::: + :::tip + If you're creating a publication manually instead of letting ClickPipes manage it, we don't recommend creating a publication `FOR ALL TABLES`, this leads to more traffic from Postgres to ClickPipes (to sending changes for other tables not in the pipe) and reduces overall efficiency. + For manually created publications, please add any tables you want to the publication before adding them to the pipe. + ::: ## Recommended `max_slot_wal_keep_size` settings {#recommended-max_slot_wal_keep_size-settings} @@ -243,7 +242,7 @@ Another reason we've observed is the presence of downstream Materialized Views w The `invalid snapshot identifier` error occurs when there is a connection drop between ClickPipes and your Postgres database. This can happen due to gateway timeouts, database restarts, or other transient issues. -It is recommended that you do not carry out any disruptive operations like upgrades or restarts on your Postgres database while Initial Load is in progress and ensure that the network connection to your database is stable. +It is recommended that you do not carry out any disruptive operations like upgrades or restarts on your Postgres database while Initial Load is in progress and ensure that the network connection to your database is stable. To resolve this issue, you can trigger a resync from the ClickPipes UI. This will restart the initial load process from the beginning. @@ -256,17 +255,17 @@ To recover your ClickPipe after dropping a publication: 1. Create a new publication with the same name and required tables in Postgres 2. Click the 'Resync tables' button in the Settings tab of your ClickPipe -This resync is necessary because the recreated publication will have a different Object Identifier (OID) in Postgres, even if it has the same name. The resync process refreshes your destination tables and restores the connection. + This resync is necessary because the recreated publication will have a different Object Identifier (OID) in Postgres, even if it has the same name. The resync process refreshes your destination tables and restores the connection. -Alternatively, you can create an entirely new pipe if preferred. + Alternatively, you can create an entirely new pipe if preferred. 
-Note that if you're working with partitioned tables, make sure to create your publication with the appropriate settings: + Note that if you're working with partitioned tables, make sure to create your publication with the appropriate settings: -```sql -CREATE PUBLICATION clickpipes_publication -FOR TABLE <...>, <...> -WITH (publish_via_partition_root = true); -``` + ```sql + CREATE PUBLICATION clickpipes_publication + FOR TABLE <...>, <...> + WITH (publish_via_partition_root = true); + ``` ## What if I am seeing `Unexpected Datatype` errors or `Cannot parse type XX ...` {#what-if-i-am-seeing-unexpected-datatype-errors} diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/index.md b/docs/integrations/data-ingestion/clickpipes/postgres/index.md index 85923d822ec..52bd9e4ff4e 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/index.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/index.md @@ -42,14 +42,13 @@ To get started, you first need to make sure that your Postgres database is set u 9. [TimescaleDB](./postgres/source/timescale), if you are using the TimescaleDB extension on a managed service or self-hosted instance. + :::warning -:::warning + Postgres Proxies like PgBouncer, RDS Proxy, Supabase Pooler, etc., are not supported for CDC based replication. Please make sure to NOT use them for the ClickPipes setup and instead add connection details of the actual Postgres database. -Postgres Proxies like PgBouncer, RDS Proxy, Supabase Pooler, etc., are not supported for CDC based replication. Please make sure to NOT use them for the ClickPipes setup and instead add connection details of the actual Postgres database. + ::: -::: - -Once your source Postgres database is set up, you can continue creating your ClickPipe. + Once your source Postgres database is set up, you can continue creating your ClickPipe. ## Creating your ClickPipe {#creating-your-clickpipe} @@ -58,28 +57,28 @@ Make sure you are logged in to your ClickHouse Cloud account. If you don't have [//]: # ( TODO update image here) 1. In the ClickHouse Cloud console, navigate to your ClickHouse Cloud Service. - + 2. Select the `Data Sources` button on the left-side menu and click on "Set up a ClickPipe" - + 3. Select the `Postgres CDC` tile - + ### Adding your source Postgres database connection {#adding-your-source-postgres-database-connection} 4. Fill in the connection details for your source Postgres database which you configured in the prerequisites step. - :::info + :::info - Before you start adding your connection details make sure that you have whitelisted ClickPipes IP addresses in your firewall rules. You can find the list of ClickPipes IP addresses [here](../index.md#list-of-static-ips). - For more information refer to the source Postgres setup guides linked at [the top of this page](#prerequisites). + Before you start adding your connection details make sure that you have whitelisted ClickPipes IP addresses in your firewall rules. You can find the list of ClickPipes IP addresses [here](../index.md#list-of-static-ips). + For more information refer to the source Postgres setup guides linked at [the top of this page](#prerequisites). - ::: + ::: - + #### (Optional) Setting up AWS Private Link {#optional-setting-up-aws-private-link} @@ -91,28 +90,27 @@ You can follow the [setup guide to set up the connection](/integrations/clickpip You can specify SSH tunneling details if your source Postgres database is not publicly accessible. - 1. Enable the "Use SSH Tunnelling" toggle. 2. 
Fill in the SSH connection details. - + 3. To use Key-based authentication, click on "Revoke and generate key pair" to generate a new key pair and copy the generated public key to your SSH server under `~/.ssh/authorized_keys`. 4. Click on "Verify Connection" to verify the connection. -:::note + :::note -Make sure to whitelist [ClickPipes IP addresses](../clickpipes#list-of-static-ips) in your firewall rules for the SSH bastion host so that ClickPipes can establish the SSH tunnel. + Make sure to whitelist [ClickPipes IP addresses](../clickpipes#list-of-static-ips) in your firewall rules for the SSH bastion host so that ClickPipes can establish the SSH tunnel. -::: + ::: -Once the connection details are filled in, click on "Next". + Once the connection details are filled in, click on "Next". ### Configuring the replication settings {#configuring-the-replication-settings} 5. Make sure to select the replication slot from the dropdown list you created in the prerequisites step. - + #### Advanced settings {#advanced-settings} @@ -124,24 +122,23 @@ You can configure the Advanced settings if needed. A brief description of each s - **Snapshot number of rows per partition**: This is the number of rows that will be fetched in each partition during the initial snapshot. This is useful when you have a large number of rows in your tables and you want to control the number of rows fetched in each partition. - **Snapshot number of tables in parallel**: This is the number of tables that will be fetched in parallel during the initial snapshot. This is useful when you have a large number of tables and you want to control the number of tables fetched in parallel. - ### Configuring the tables {#configuring-the-tables} 6. Here you can select the destination database for your ClickPipe. You can either select an existing database or create a new one. - + 7. You can select the tables you want to replicate from the source Postgres database. While selecting the tables, you can also choose to rename the tables in the destination ClickHouse database as well as exclude specific columns. - :::warning - If you are defining an ordering key in ClickHouse differently than from the primary key in Postgres, don't forget to read all the [considerations](/integrations/clickpipes/postgres/ordering_keys) around it - ::: + :::warning + If you are defining an ordering key in ClickHouse differently than from the primary key in Postgres, don't forget to read all the [considerations](/integrations/clickpipes/postgres/ordering_keys) around it + ::: ### Review permissions and start the ClickPipe {#review-permissions-and-start-the-clickpipe} 8. Select the "Full access" role from the permissions dropdown and click "Complete Setup". - + ## What's next? {#whats-next} diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/maintenance.md b/docs/integrations/data-ingestion/clickpipes/postgres/maintenance.md index 902dde2c10d..f5a23ef0ab4 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/maintenance.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/maintenance.md @@ -11,6 +11,5 @@ There is an upcoming maintenance window for Postgres ClickPipes scheduled on: - **Date:** 17 April 2025 - **Time:** 07:00 AM - 08:00 AM UTC -During this time, your Postgres Pipes will experience a brief downtime. -The ClickPipes will be available again after the maintenance window and will resume normal operations. - + During this time, your Postgres Pipes will experience a brief downtime. 
+ The ClickPipes will be available again after the maintenance window and will resume normal operations.
diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/ordering_keys.md b/docs/integrations/data-ingestion/clickpipes/postgres/ordering_keys.md
index 91337e28051..d1c73ece217 100644
--- a/docs/integrations/data-ingestion/clickpipes/postgres/ordering_keys.md
+++ b/docs/integrations/data-ingestion/clickpipes/postgres/ordering_keys.md
@@ -7,13 +7,13 @@ title: 'Ordering Keys'

Ordering Keys (a.k.a. sorting keys) define how data is sorted on disk and indexed for a table in ClickHouse. When replicating from Postgres, ClickPipes sets the Postgres primary key of a table as the ordering key for the corresponding table in ClickHouse. In most cases, the Postgres primary key serves as a sufficient ordering key, as ClickHouse is already optimized for fast scans, and custom ordering keys are often not required.

-As describe in the [migration guide](/migrations/postgresql/data-modeling-techniques), for larger use cases you should include additional columns beyond the Postgres primary key in the ClickHouse ordering key to optimize queries. 
+As described in the [migration guide](/migrations/postgresql/data-modeling-techniques), for larger use cases you should include additional columns beyond the Postgres primary key in the ClickHouse ordering key to optimize queries.

By default with CDC, choosing an ordering key different from the Postgres primary key can cause data deduplication issues in ClickHouse. This happens because the ordering key in ClickHouse serves a dual role: it controls data indexing and sorting while acting as the deduplication key. The easiest way to address this issue is by defining refreshable materialized views.

## Use refreshable materialized views {#use-refreshable-materialized-views}

-A simple way to define custom ordering keys (ORDER BY) is using [refreshable materialized views](/materialized-view/refreshable-materialized-view) (MVs). These allow you to periodically (e.g., every 5 or 10 minutes) copy the entire table with the desired ordering key. 
+A simple way to define custom ordering keys (ORDER BY) is using [refreshable materialized views](/materialized-view/refreshable-materialized-view) (MVs). These allow you to periodically (e.g., every 5 or 10 minutes) copy the entire table with the desired ordering key.
Below is an example of a Refreshable MV with a custom ORDER BY and required deduplication: @@ -22,7 +22,7 @@ CREATE MATERIALIZED VIEW posts_final REFRESH EVERY 10 second ENGINE = ReplacingMergeTree(_peerdb_version) ORDER BY (owneruserid,id) -- different ordering key but with suffixed postgres pkey AS -SELECT * FROM posts FINAL +SELECT * FROM posts FINAL WHERE _peerdb_is_deleted = 0; -- this does the deduplication ``` diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/pause_and_resume.md b/docs/integrations/data-ingestion/clickpipes/postgres/pause_and_resume.md index 1c48cc7e1d8..c1b5ce2ce20 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/pause_and_resume.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/pause_and_resume.md @@ -12,7 +12,6 @@ import pause_status from '@site/static/images/integrations/data-ingestion/clickp import resume_button from '@site/static/images/integrations/data-ingestion/clickpipes/postgres/resume_button.png' import resume_dialog from '@site/static/images/integrations/data-ingestion/clickpipes/postgres/resume_dialog.png' - There are scenarios where it would be useful to pause a Postgres ClickPipe. For example, you may want to run some analytics on existing data in a static state. Or, you might be performing upgrades on Postgres. Here is how you can pause and resume a Postgres ClickPipe. ## Steps to pause a Postgres ClickPipe {#pause-clickpipe-steps} @@ -20,33 +19,28 @@ There are scenarios where it would be useful to pause a Postgres ClickPipe. For 1. In the Data Sources tab, click on the Postgres ClickPipe you wish to pause. 2. Head over to the **Settings** tab. 3. Click on the **Pause** button. -
- + 4. A dialog box should appear for confirmation. Click on Pause again. -
-

+

5. Head over to the **Metrics** tab.
6. In around 5 seconds (and also on page refresh), the status of the pipe should be **Paused**.
-
- + ## Steps to resume a Postgres ClickPipe {#resume-clickpipe-steps} 1. In the Data Sources tab, click on the Postgres ClickPipe you wish to resume. The status of the mirror should be **Paused** initially. 2. Head over to the **Settings** tab. 3. Click on the **Resume** button. -
- + 4. A dialog box should appear for confirmation. Click on Resume again. -
- + 5. Head over to the **Metrics** tab. 6. In around 5 seconds (and also on page refresh), the status of the pipe should be **Running**. diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/remove_table.md b/docs/integrations/data-ingestion/clickpipes/postgres/remove_table.md index ea21858d47d..90c36911375 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/remove_table.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/remove_table.md @@ -18,9 +18,8 @@ The first step is to remove the table from the pipe. This can be done by the fol 2. Click on Edit Table Settings. 3. Locate your table - this can be done by searching it in the search bar. 4. Deselect the table by clicking on the selected checkbox. -
- + 5. Click update. 6. Upon successful update, in the **Metrics** tab the status will be **Running**. This table will no longer be replicated by this ClickPipe. diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/source/aurora.md b/docs/integrations/data-ingestion/clickpipes/postgres/source/aurora.md index 8a569f4f0a2..5460c067ea6 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/source/aurora.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/source/aurora.md @@ -26,41 +26,41 @@ You can skip this section if your Aurora instance already has the following sett - `rds.logical_replication = 1` - `wal_sender_timeout = 0` -These settings are typically pre-configured if you previously used another data replication tool. - -```text -postgres=> SHOW rds.logical_replication ; - rds.logical_replication -------------------------- - on -(1 row) - -postgres=> SHOW wal_sender_timeout ; - wal_sender_timeout --------------------- - 0 -(1 row) -``` + These settings are typically pre-configured if you previously used another data replication tool. + + ```text + postgres=> SHOW rds.logical_replication ; + rds.logical_replication + ------------------------- + on + (1 row) + + postgres=> SHOW wal_sender_timeout ; + wal_sender_timeout + -------------------- + 0 + (1 row) + ``` -If not already configured, follow these steps: + If not already configured, follow these steps: 1. Create a new parameter group for your Aurora PostgreSQL version with the required settings: - Set `rds.logical_replication` to 1 - Set `wal_sender_timeout` to 0 - + - + - + 2. Apply the new parameter group to your Aurora PostgreSQL cluster - + 3. Reboot your Aurora cluster to apply the changes - + ## Configure database user {#configure-database-user} @@ -92,7 +92,6 @@ Connect to your Aurora PostgreSQL writer instance as an admin user and execute t CREATE PUBLICATION clickpipes_publication FOR ALL TABLES; ``` - ## Configure network access {#configure-network-access} ### IP-based access control {#ip-based-access-control} diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/source/azure-flexible-server-postgres.md b/docs/integrations/data-ingestion/clickpipes/postgres/source/azure-flexible-server-postgres.md index 391438a9112..43e6a7327fe 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/source/azure-flexible-server-postgres.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/source/azure-flexible-server-postgres.md @@ -21,15 +21,15 @@ ClickPipes supports Postgres version 12 and later. 1. Click on the **Server parameters** section - + 2. Edit the `wal_level` to `logical` - + 3. This change would require a server restart. So restart when requested. - + ## Creating ClickPipes users and granting permissions {#creating-clickpipes-user-and-granting-permissions} @@ -37,46 +37,44 @@ Connect to your Azure Flexible Server Postgres through the admin user and run th 1. Create a Postgres user for exclusively ClickPipes. - ```sql - CREATE USER clickpipes_user PASSWORD 'some-password'; - ``` + ```sql + CREATE USER clickpipes_user PASSWORD 'some-password'; + ``` 2. Provide read-only access to the schema from which you are replicating tables to the `clickpipes_user`. Below example shows setting up permissions for the `public` schema. If you want to grant access to multiple schemas, you can run these three commands for each schema. 
- ```sql - GRANT USAGE ON SCHEMA "public" TO clickpipes_user; - GRANT SELECT ON ALL TABLES IN SCHEMA "public" TO clickpipes_user; - ALTER DEFAULT PRIVILEGES IN SCHEMA "public" GRANT SELECT ON TABLES TO clickpipes_user; - ``` + ```sql + GRANT USAGE ON SCHEMA "public" TO clickpipes_user; + GRANT SELECT ON ALL TABLES IN SCHEMA "public" TO clickpipes_user; + ALTER DEFAULT PRIVILEGES IN SCHEMA "public" GRANT SELECT ON TABLES TO clickpipes_user; + ``` 3. Grant replication access to this user: - ```sql - ALTER ROLE clickpipes_user REPLICATION; - ``` + ```sql + ALTER ROLE clickpipes_user REPLICATION; + ``` 4. Create publication that you'll be using for creating the MIRROR (replication) in future. - ```sql - CREATE PUBLICATION clickpipes_publication FOR ALL TABLES; - ``` + ```sql + CREATE PUBLICATION clickpipes_publication FOR ALL TABLES; + ``` 5. Set `wal_sender_timeout` to 0 for `clickpipes_user` - ```sql - ALTER ROLE clickpipes_user SET wal_sender_timeout to 0; - ``` - + ```sql + ALTER ROLE clickpipes_user SET wal_sender_timeout to 0; + ``` ## Add ClickPipes IPs to Firewall {#add-clickpipes-ips-to-firewall} Please follow the below steps to add [ClickPipes IPs](../../index.md#list-of-static-ips) to your network. 1. Go to the **Networking** tab and add the [ClickPipes IPs](../../index.md#list-of-static-ips) to the Firewall - of your Azure Flexible Server Postgres OR the Jump Server/Bastion if you are using SSH tunneling. - - + of your Azure Flexible Server Postgres OR the Jump Server/Bastion if you are using SSH tunneling. + ## What's next? {#whats-next} diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/source/crunchy-postgres.md b/docs/integrations/data-ingestion/clickpipes/postgres/source/crunchy-postgres.md index 865690d1ffb..aa5cbf6e155 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/source/crunchy-postgres.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/source/crunchy-postgres.md @@ -11,7 +11,6 @@ import Image from '@theme/IdealImage'; # Crunchy Bridge Postgres source setup guide - ClickPipes supports Postgres version 12 and later. ## Enable logical replication {#enable-logical-replication} @@ -62,8 +61,6 @@ Safelist [ClickPipes IPs](../../index.md#list-of-static-ips) by adding the Firew - - ## What's next? {#whats-next} You can now [create your ClickPipe](../index.md) and start ingesting data from your Postgres instance into ClickHouse Cloud. diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/source/generic.md b/docs/integrations/data-ingestion/clickpipes/postgres/source/generic.md index 22cf9cd6c88..c9b98d4228e 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/source/generic.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/source/generic.md @@ -13,7 +13,6 @@ If you use one of the supported providers (in the sidebar), please refer to the ::: - ClickPipes supports Postgres version 12 and later. ## Enable logical replication {#enable-logical-replication} @@ -23,12 +22,12 @@ ClickPipes supports Postgres version 12 and later. ```sql wal_level = logical ``` - To check the same, you can run the following SQL command: + To check the same, you can run the following SQL command: ```sql SHOW wal_level; ``` - The output should be `logical`. If not, run: + The output should be `logical`. If not, run: ```sql ALTER SYSTEM SET wal_level = logical; ``` @@ -38,20 +37,19 @@ ClickPipes supports Postgres version 12 and later. 
max_wal_senders > 1 max_replication_slots >= 4 ``` - To check the same, you can run the following SQL commands: + To check the same, you can run the following SQL commands: ```sql SHOW max_wal_senders; SHOW max_replication_slots; ``` - If the values do not match the recommended values, you can run the following SQL commands to set them: + If the values do not match the recommended values, you can run the following SQL commands to set them: ```sql ALTER SYSTEM SET max_wal_senders = 10; ALTER SYSTEM SET max_replication_slots = 10; ``` 3. If you have made any changes to the configuration as mentioned above, you NEED to RESTART the Postgres instance for the changes to take effect. - ## Creating a user with permissions and publication {#creating-a-user-with-permissions-and-publication} Let's create a new user for ClickPipes with the necessary permissions suitable for CDC, @@ -76,7 +74,6 @@ Make sure to replace `clickpipes_user` and `clickpipes_password` with your desir ::: - ## Enabling connections in pg_hba.conf to the ClickPipes User {#enabling-connections-in-pg_hbaconf-to-the-clickpipes-user} If you are self serving, you need to allow connections to the ClickPipes user from the ClickPipes IP addresses by following the below steps. If you are using a managed service, you can do the same by following the provider's documentation. @@ -91,7 +88,6 @@ If you are self serving, you need to allow connections to the ClickPipes user fr SELECT pg_reload_conf(); ``` - ## Increase `max_slot_wal_keep_size` {#increase-max_slot_wal_keep_size} This is a recommended configuration change to ensure that large transactions/commits do not cause the replication slot to be dropped. diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/source/google-cloudsql.md b/docs/integrations/data-ingestion/clickpipes/postgres/source/google-cloudsql.md index 2f4cca66aaa..db8b5646516 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/source/google-cloudsql.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/source/google-cloudsql.md @@ -23,7 +23,6 @@ If you use one of the supported providers (in the sidebar), please refer to the ::: - ## Supported Postgres versions {#supported-postgres-versions} Anything on or after Postgres 12 @@ -34,14 +33,13 @@ Anything on or after Postgres 12 1. Click on **Edit** button on the Overview page. - + 2. Go to Flags and change `cloudsql.logical_decoding` to on and `wal_sender_timeout` to 0. These changes will need restarting your Postgres server. - - - - + + + ## Creating ClickPipes user and granting permissions {#creating-clickpipes-user-and-granting-permissions} @@ -49,32 +47,31 @@ Connect to your Cloud SQL Postgres through the admin user and run the below comm 1. Create a Postgres user for exclusively ClickPipes. - ```sql - CREATE USER clickpipes_user PASSWORD 'some-password'; - ``` + ```sql + CREATE USER clickpipes_user PASSWORD 'some-password'; + ``` 2. Provide read-only access to the schema from which you are replicating tables to the `clickpipes_user`. Below example shows setting up permissions for the `public` schema. If you want to grant access to multiple schemas, you can run these three commands for each schema. 
- ```sql - GRANT USAGE ON SCHEMA "public" TO clickpipes_user; - GRANT SELECT ON ALL TABLES IN SCHEMA "public" TO clickpipes_user; - ALTER DEFAULT PRIVILEGES IN SCHEMA "public" GRANT SELECT ON TABLES TO clickpipes_user; - ``` + ```sql + GRANT USAGE ON SCHEMA "public" TO clickpipes_user; + GRANT SELECT ON ALL TABLES IN SCHEMA "public" TO clickpipes_user; + ALTER DEFAULT PRIVILEGES IN SCHEMA "public" GRANT SELECT ON TABLES TO clickpipes_user; + ``` 3. Grant replication access to this user: - ```sql - ALTER ROLE clickpipes_user REPLICATION; - ``` + ```sql + ALTER ROLE clickpipes_user REPLICATION; + ``` 4. Create publication that you'll be using for creating the MIRROR (replication) in future. - ```sql - CREATE PUBLICATION clickpipes_publication FOR ALL TABLES; - ``` - -[//]: # (TODO Add SSH Tunneling) + ```sql + CREATE PUBLICATION clickpipes_publication FOR ALL TABLES; + ``` + [//]: # (TODO Add SSH Tunneling) ## Add ClickPipes IPs to Firewall {#add-clickpipes-ips-to-firewall} @@ -88,17 +85,16 @@ If your are using SSH Tunneling, then you need to add the [ClickPipes IPs](../.. 1. Go to **Connections** section - + 2. Go to the Networking subsection - + 3. Add the [public IPs of ClickPipes](../../index.md#list-of-static-ips) - - - + + ## What's next? {#whats-next} diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/source/rds.md b/docs/integrations/data-ingestion/clickpipes/postgres/source/rds.md index 573d4a4a9c3..ef332686b1a 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/source/rds.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/source/rds.md @@ -26,41 +26,41 @@ You can skip this section if your RDS instance already has the following setting - `rds.logical_replication = 1` - `wal_sender_timeout = 0` -These settings are typically pre-configured if you previously used another data replication tool. - -```text -postgres=> SHOW rds.logical_replication ; - rds.logical_replication -------------------------- - on -(1 row) - -postgres=> SHOW wal_sender_timeout ; - wal_sender_timeout --------------------- - 0 -(1 row) -``` + These settings are typically pre-configured if you previously used another data replication tool. + + ```text + postgres=> SHOW rds.logical_replication ; + rds.logical_replication + ------------------------- + on + (1 row) + + postgres=> SHOW wal_sender_timeout ; + wal_sender_timeout + -------------------- + 0 + (1 row) + ``` -If not already configured, follow these steps: + If not already configured, follow these steps: 1. Create a new parameter group for your Postgres version with the required settings: - Set `rds.logical_replication` to 1 - Set `wal_sender_timeout` to 0 - + - + - + 2. Apply the new parameter group to your RDS Postgres database - + 3. 
Reboot your RDS instance to apply the changes - + ## Configure database user {#configure-database-user} @@ -92,7 +92,6 @@ Connect to your RDS Postgres instance as an admin user and execute the following CREATE PUBLICATION clickpipes_publication FOR ALL TABLES; ``` - ## Configure network access {#configure-network-access} ### IP-based access control {#ip-based-access-control} diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/source/supabase.md b/docs/integrations/data-ingestion/clickpipes/postgres/source/supabase.md index 7890d0e87cc..cb750c42544 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/source/supabase.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/source/supabase.md @@ -19,7 +19,6 @@ ClickPipes supports Supabase via IPv6 natively for seamless replication. ::: - ## Creating a user with permissions and replication slot {#creating-a-user-with-permissions-and-replication-slot} Let's create a new user for ClickPipes with the necessary permissions suitable for CDC, @@ -42,7 +41,6 @@ Here, we can run the following SQL commands: - Click on **Run** to have a publication and a user ready. :::note @@ -53,10 +51,8 @@ Also, remember to use the same publication name when creating the mirror in Clic ::: - ## Increase `max_slot_wal_keep_size` {#increase-max_slot_wal_keep_size} - :::warning This step will restart your Supabase database and may cause a brief downtime. @@ -81,7 +77,6 @@ The connection pooler is not supported for CDC based replication, hence it needs ::: - ## What's next? {#whats-next} You can now [create your ClickPipe](../index.md) and start ingesting data from your Postgres instance into ClickHouse Cloud. diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/source/timescale.md b/docs/integrations/data-ingestion/clickpipes/postgres/source/timescale.md index 3c94bdb774f..d8ea7828462 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/source/timescale.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/source/timescale.md @@ -14,22 +14,22 @@ import BetaBadge from '@theme/badges/BetaBadge'; ## Background {#background} -[TimescaleDB](https://github.com/timescale/timescaledb) is an open-source Postgres extension developed by Timescale Inc -that aims to boost the performance of analytics queries without having to move away from Postgres. This is achieved by -creating "hypertables" which are managed by the extension and support automatic partitioning into "chunks". +[TimescaleDB](https://github.com/timescale/timescaledb) is an open-source Postgres extension developed by Timescale Inc +that aims to boost the performance of analytics queries without having to move away from Postgres. This is achieved by +creating "hypertables" which are managed by the extension and support automatic partitioning into "chunks". Hypertables also support transparent compression and hybrid row-columnar storage (known as "hypercore"), although these features require a version of the extension that has a proprietary license. -Timescale Inc also offers two managed services for TimescaleDB: +Timescale Inc also offers two managed services for TimescaleDB: - `Managed Service for Timescale` -- `Timescale Cloud`. +- `Timescale Cloud`. -There are third-party vendors offering managed services that allow you to use the TimescaleDB extension, but due to - licensing, these vendors only support the open-source version of the extension. 
+ There are third-party vendors offering managed services that allow you to use the TimescaleDB extension, but due to + licensing, these vendors only support the open-source version of the extension. -Timescale hypertables behave differently from regular Postgres tables in several ways. This poses some complications -to the process of replicating them, which is why the ability to replicate Timescale hypertables should be considered as -**best effort**. + Timescale hypertables behave differently from regular Postgres tables in several ways. This poses some complications + to the process of replicating them, which is why the ability to replicate Timescale hypertables should be considered as + **best effort**. ## Supported Postgres versions {#supported-postgres-versions} @@ -37,23 +37,23 @@ ClickPipes supports Postgres version 12 and later. ## Enable logical replication {#enable-logical-replication} -The steps to be follow depend on how your Postgres instance with TimescaleDB is deployed. +The steps to be follow depend on how your Postgres instance with TimescaleDB is deployed. - If you're using a managed service and your provider is listed in the sidebar, please follow the guide for that provider. -- If you're deploying TimescaleDB yourself, follow the generic guide. +- If you're deploying TimescaleDB yourself, follow the generic guide. -For other managed services, please raise a support ticket with your provider to help in enabling logical replication if -it isn't already. + For other managed services, please raise a support ticket with your provider to help in enabling logical replication if + it isn't already. -:::info -Timescale Cloud does not support enabling logical replication, which is needed for Postgres pipes in CDC mode. -As a result, users of Timescale Cloud can only perform a one-time load of their data (`Initial Load Only`) with the -Postgres ClickPipe. -::: + :::info + Timescale Cloud does not support enabling logical replication, which is needed for Postgres pipes in CDC mode. + As a result, users of Timescale Cloud can only perform a one-time load of their data (`Initial Load Only`) with the + Postgres ClickPipe. + ::: ## Configuration {#configuration} -Timescale hypertables don't store any data inserted into them. Instead, the data is stored in multiple corresponding +Timescale hypertables don't store any data inserted into them. Instead, the data is stored in multiple corresponding "chunk" tables which are in the `_timescaledb_internal` schema. For running queries on the hypertables, this is not an issue. But during logical replication, instead of detecting changes in the hypertable we detect them in the chunk table instead. The Postgres ClickPipe has logic to automatically remap changes from the chunk tables to the parent hypertable, @@ -65,45 +65,45 @@ If you'd like to only perform a one-time load of your data (`Initial Load Only`) 1. Create a Postgres user for the pipe and grant it permissions to `SELECT` the tables you wish to replicate. -```sql - CREATE USER clickpipes_user PASSWORD 'clickpipes_password'; - GRANT USAGE ON SCHEMA "public" TO clickpipes_user; - -- If desired, you can refine these GRANTs to individual tables alone, instead of the entire schema - -- But when adding new tables to the ClickPipe, you'll need to add them to the user as well. 
- GRANT SELECT ON ALL TABLES IN SCHEMA "public" TO clickpipes_user; - ALTER DEFAULT PRIVILEGES IN SCHEMA "public" GRANT SELECT ON TABLES TO clickpipes_user; -``` - -:::note -Make sure to replace `clickpipes_user` and `clickpipes_password` with your desired username and password. -::: - -2. As a Postgres superuser/admin user, create a publication on the source instance that has the tables and hypertables - you want to replicate and **also includes the entire `_timescaledb_internal` schema**. While creating the ClickPipe, you need to select this publication. - -```sql --- When adding new tables to the ClickPipe, you'll need to add them to the publication as well manually. - CREATE PUBLICATION clickpipes_publication FOR TABLE <...>, TABLES IN SCHEMA _timescaledb_internal; -``` - -:::tip -We don't recommend creating a publication `FOR ALL TABLES`, this leads to more traffic from Postgres to ClickPipes (to sending changes for other tables not in the pipe) and reduces overall efficiency. -::: - -:::info -Some managed services don't give their admin users the required permissions to create a publication for an entire schema. -If this is the case, please raise a support ticket with your provider. Alternatively, you can skip this step and the following -steps and perform a one-time load of your data. -::: + ```sql + CREATE USER clickpipes_user PASSWORD 'clickpipes_password'; + GRANT USAGE ON SCHEMA "public" TO clickpipes_user; + -- If desired, you can refine these GRANTs to individual tables alone, instead of the entire schema + -- But when adding new tables to the ClickPipe, you'll need to add them to the user as well. + GRANT SELECT ON ALL TABLES IN SCHEMA "public" TO clickpipes_user; + ALTER DEFAULT PRIVILEGES IN SCHEMA "public" GRANT SELECT ON TABLES TO clickpipes_user; + ``` + + :::note + Make sure to replace `clickpipes_user` and `clickpipes_password` with your desired username and password. + ::: + +2. As a Postgres superuser/admin user, create a publication on the source instance that has the tables and hypertables + you want to replicate and **also includes the entire `_timescaledb_internal` schema**. While creating the ClickPipe, you need to select this publication. + + ```sql + -- When adding new tables to the ClickPipe, you'll need to add them to the publication as well manually. + CREATE PUBLICATION clickpipes_publication FOR TABLE <...>, TABLES IN SCHEMA _timescaledb_internal; + ``` + + :::tip + We don't recommend creating a publication `FOR ALL TABLES`, this leads to more traffic from Postgres to ClickPipes (to sending changes for other tables not in the pipe) and reduces overall efficiency. + ::: + + :::info + Some managed services don't give their admin users the required permissions to create a publication for an entire schema. + If this is the case, please raise a support ticket with your provider. Alternatively, you can skip this step and the following + steps and perform a one-time load of your data. + ::: 3. Grant replication permissions to the user created earlier. -```sql --- Give replication permission to the USER - ALTER USER clickpipes_user REPLICATION; -``` + ```sql + -- Give replication permission to the USER + ALTER USER clickpipes_user REPLICATION; + ``` -After these steps, you should be able to proceed with [creating a ClickPipe](../index.md). + After these steps, you should be able to proceed with [creating a ClickPipe](../index.md). 
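As an optional sanity check before creating the pipe — a minimal sketch, assuming the `clickpipes_publication` name used above — you can confirm that the publication covers both your hypertables and the `_timescaledb_internal` chunk schema:

```sql
-- Lists every table the publication will replicate; the chunk tables in
-- _timescaledb_internal should appear here alongside your hypertables.
SELECT pubname, schemaname, tablename
FROM pg_publication_tables
WHERE pubname = 'clickpipes_publication'
ORDER BY schemaname, tablename;
```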
## Troubleshooting {#troubleshooting} @@ -118,5 +118,5 @@ You may need to disable [compression](https://docs.timescale.com/api/latest/comp ## Configure network access {#configure-network-access} If you want to restrict traffic to your Timescale instance, please allowlist the [documented static NAT IPs](../../index.md#list-of-static-ips). -Instructions to do this will vary across providers, please consult the sidebar if your provider is listed or raise a +Instructions to do this will vary across providers, please consult the sidebar if your provider is listed or raise a ticket with them. diff --git a/docs/integrations/data-ingestion/clickpipes/postgres/toast.md b/docs/integrations/data-ingestion/clickpipes/postgres/toast.md index bf3fd659f2e..9792bb25091 100644 --- a/docs/integrations/data-ingestion/clickpipes/postgres/toast.md +++ b/docs/integrations/data-ingestion/clickpipes/postgres/toast.md @@ -51,12 +51,12 @@ If `REPLICA IDENTITY FULL` is not set for a table with TOAST columns, you may en 1. For INSERT operations, all columns (including TOAST columns) will be replicated correctly. 2. For UPDATE operations: - - If a TOAST column is not modified, its value will appear as NULL or empty in ClickHouse. - - If a TOAST column is modified, it will be replicated correctly. + - If a TOAST column is not modified, its value will appear as NULL or empty in ClickHouse. + - If a TOAST column is modified, it will be replicated correctly. 3. For DELETE operations, TOAST column values will appear as NULL or empty in ClickHouse. -These behaviors can lead to data inconsistencies between your PostgreSQL source and ClickHouse destination. Therefore, it's crucial to set `REPLICA IDENTITY FULL` for tables with TOAST columns to ensure accurate and complete data replication. + These behaviors can lead to data inconsistencies between your PostgreSQL source and ClickHouse destination. Therefore, it's crucial to set `REPLICA IDENTITY FULL` for tables with TOAST columns to ensure accurate and complete data replication. ## Conclusion {#conclusion} diff --git a/docs/integrations/data-ingestion/data-formats/arrow-avro-orc.md b/docs/integrations/data-ingestion/data-formats/arrow-avro-orc.md index 58968b30633..b34baaeef83 100644 --- a/docs/integrations/data-ingestion/data-formats/arrow-avro-orc.md +++ b/docs/integrations/data-ingestion/data-formats/arrow-avro-orc.md @@ -164,4 +164,4 @@ ClickHouse introduces support for many formats, both text, and binary, to cover - [Native and binary formats](binary.md) - [SQL formats](sql.md) -And also check [clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) - a portable full-featured tool to work on local/remote files without the need for Clickhouse server. + And also check [clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) - a portable full-featured tool to work on local/remote files without the need for Clickhouse server. 
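For instance — a minimal sketch, assuming a local `data.parquet` file — the `file()` table function used elsewhere in these format guides works the same way in `clickhouse-local`:

```sql
-- The schema is inferred from the Parquet metadata, so no column list is needed;
-- swap Parquet for Arrow, Avro, or ORC to inspect files in those formats the same way.
SELECT *
FROM file('data.parquet', Parquet)
LIMIT 5;
```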
diff --git a/docs/integrations/data-ingestion/data-formats/binary.md b/docs/integrations/data-ingestion/data-formats/binary.md index 254948a18fd..e2e34cd4bf2 100644 --- a/docs/integrations/data-ingestion/data-formats/binary.md +++ b/docs/integrations/data-ingestion/data-formats/binary.md @@ -237,4 +237,4 @@ ClickHouse introduces support for many formats, both text, and binary, to cover - **Native and binary formats** - [SQL formats](sql.md) -And also check [clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) - a portable full-featured tool to work on local/remote files without starting ClickHouse server. + And also check [clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) - a portable full-featured tool to work on local/remote files without starting ClickHouse server. diff --git a/docs/integrations/data-ingestion/data-formats/csv-tsv.md b/docs/integrations/data-ingestion/data-formats/csv-tsv.md index 4602df0d703..e95802feba4 100644 --- a/docs/integrations/data-ingestion/data-formats/csv-tsv.md +++ b/docs/integrations/data-ingestion/data-formats/csv-tsv.md @@ -9,7 +9,6 @@ description: 'Page describing how to work with CSV and TSV data in ClickHouse' ClickHouse supports importing data from and exporting to CSV. Since CSV files can come with different format specifics, including header rows, custom delimiters, and escape symbols, ClickHouse provides formats and settings to address each case efficiently. - ## Importing data from a CSV file {#importing-data-from-a-csv-file} Before importing data, let's create a table with a relevant structure: @@ -25,17 +24,14 @@ ENGINE = MergeTree ORDER BY tuple(month, path) ``` - To import data from the [CSV file](assets/data_small.csv) to the `sometable` table, we can pipe our file directly to the clickhouse-client: - ```bash clickhouse-client -q "INSERT INTO sometable FORMAT CSV" < data_small.csv ``` Note that we use [FORMAT CSV](/interfaces/formats.md/#csv) to let ClickHouse know we're ingesting CSV formatted data. Alternatively, we can load data from a local file using the [FROM INFILE](/sql-reference/statements/insert-into.md/#inserting-data-from-a-file) clause: - ```sql INSERT INTO sometable FROM INFILE 'data_small.csv' @@ -74,24 +70,20 @@ In this case, ClickHouse skips the first row while importing data from the file. Starting from [version](https://github.com/ClickHouse/ClickHouse/releases) 23.1, ClickHouse will automatically detect headers in CSV files when using the `CSV` format, so it is not necessary to use `CSVWithNames` or `CSVWithNamesAndTypes`. ::: - ### CSV files with custom delimiters {#csv-files-with-custom-delimiters} In case the CSV file uses other than comma delimiter, we can use the [format_csv_delimiter](/operations/settings/settings-formats.md/#format_csv_delimiter) option to set the relevant symbol: - ```sql SET format_csv_delimiter = ';' ``` Now, when we import from a CSV file, `;` symbol is going to be used as a delimiter instead of a comma. - ### Skipping lines in a CSV file {#skipping-lines-in-a-csv-file} Sometimes, we might skip a certain number of lines while importing data from a CSV file. 
This can be done using [input_format_csv_skip_first_lines](/operations/settings/settings-formats.md/#input_format_csv_skip_first_lines) option: - ```sql SET input_format_csv_skip_first_lines = 10 ``` @@ -113,7 +105,6 @@ The [file](assets/data_small.csv) has 1k rows, but ClickHouse loaded only 990 si When using the `file()` function, with ClickHouse Cloud you will need to run the commands in `clickhouse client` on the machine where the file resides. Another option is to use [`clickhouse-local`](/operations/utilities/clickhouse-local.md) to explore files locally. ::: - ### Treating NULL values in CSV files {#treating-null-values-in-csv-files} Null values can be encoded differently depending on the application that generated the file. By default, ClickHouse uses `\N` as a Null value in CSV. But we can change that using the [format_csv_null_representation](/operations/settings/settings-formats.md/#format_tsv_null_representation) option. @@ -159,30 +150,24 @@ SELECT * FROM file('nulls.csv') └────────┴──────┘ ``` - ## TSV (tab-separated) files {#tsv-tab-separated-files} Tab-separated data format is widely used as a data interchange format. To load data from a [TSV file](assets/data_small.tsv) to ClickHouse, the [TabSeparated](/interfaces/formats.md/#tabseparated) format is used: - ```bash clickhouse-client -q "INSERT INTO sometable FORMAT TabSeparated" < data_small.tsv ``` - There's also a [TabSeparatedWithNames](/interfaces/formats.md/#tabseparatedwithnames) format to allow working with TSV files that have headers. And, like for CSV, we can skip the first X lines using the [input_format_tsv_skip_first_lines](/operations/settings/settings-formats.md/#input_format_tsv_skip_first_lines) option. - ### Raw TSV {#raw-tsv} Sometimes, TSV files are saved without escaping tabs and line breaks. We should use [TabSeparatedRaw](/interfaces/formats.md/#tabseparatedraw) to handle such files. - ## Exporting to CSV {#exporting-to-csv} Any format in our previous examples can also be used to export data. To export data from a table (or a query) to a CSV format, we use the same `FORMAT` clause: - ```sql SELECT * FROM sometable @@ -214,7 +199,6 @@ FORMAT CSVWithNames "2016_Greater_Western_Sydney_Giants_season","2017-05-01",86 ``` - ### Saving exported data to a CSV file {#saving-exported-data-to-a-csv-file} To save exported data to a file, we can use the [INTO...OUTFILE](/sql-reference/statements/select/into-outfile.md) clause: @@ -231,7 +215,6 @@ FORMAT CSVWithNames Note how it took ClickHouse **~1** second to save 36m rows to a CSV file. - ### Exporting CSV with custom delimiters {#exporting-csv-with-custom-delimiters} If we want to have other than comma delimiters, we can use the [format_csv_delimiter](/operations/settings/settings-formats.md/#format_csv_delimiter) settings option for that: @@ -256,7 +239,6 @@ FORMAT CSV "2016_Greater_Western_Sydney_Giants_season"|"2017-05-01"|86 ``` - ### Exporting CSV for Windows {#exporting-csv-for-windows} If we want a CSV file to work fine in a Windows environment, we should consider enabling [output_format_csv_crlf_end_of_line](/operations/settings/settings-formats.md/#output_format_csv_crlf_end_of_line) option. This will use `\r\n` as a line breaks instead of `\n`: @@ -269,7 +251,6 @@ SET output_format_csv_crlf_end_of_line = 1; We might work with unknown CSV files in many cases, so we have to explore which types to use for columns. Clickhouse, by default, will try to guess data formats based on its analysis of a given CSV file. This is known as "Schema Inference". 
Detected data types can be explored using the `DESCRIBE` statement in pair with the [file()](/sql-reference/table-functions/file.md) function: - ```sql DESCRIBE file('data-small.csv', CSV) ``` @@ -281,10 +262,8 @@ DESCRIBE file('data-small.csv', CSV) └──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ ``` - Here, ClickHouse could guess column types for our CSV file efficiently. If we don't want ClickHouse to guess, we can disable this with the following option: - ```sql SET input_format_csv_use_best_effort_in_schema_inference = 0 ``` @@ -311,10 +290,8 @@ FORMAT CSVWithNamesAndTypes "2016_Greater_Western_Sydney_Giants_season","2017-05-01",86 ``` - This format will include two header rows - one with column names and the other with column types. This will allow ClickHouse (and other apps) to identify column types when loading data from [such files](assets/data_csv_types.csv): - ```sql DESCRIBE file('data_csv_types.csv', CSVWithNamesAndTypes) ``` @@ -365,12 +342,10 @@ LIMIT 3 We can also use [CustomSeparatedWithNames](/interfaces/formats.md/#customseparatedwithnames) to get headers exported and imported correctly. Explore [regex and template](templates-regex.md) formats to deal with even more complex cases. - ## Working with large CSV files {#working-with-large-csv-files} CSV files can be large, and ClickHouse works efficiently with files of any size. Large files usually come compressed, and ClickHouse covers this with no need for decompression before processing. We can use a `COMPRESSION` clause during an insert: - ```sql INSERT INTO sometable FROM INFILE 'data_csv.csv.gz' @@ -399,4 +374,4 @@ ClickHouse introduces support for many formats, both text, and binary, to cover - [Native and binary formats](binary.md) - [SQL formats](sql.md) -And also check [clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) - a portable full-featured tool to work on local/remote files without the need for Clickhouse server. + And also check [clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) - a portable full-featured tool to work on local/remote files without the need for Clickhouse server. diff --git a/docs/integrations/data-ingestion/data-formats/json/formats.md b/docs/integrations/data-ingestion/data-formats/json/formats.md index a14049df1b3..1e4b5d92028 100644 --- a/docs/integrations/data-ingestion/data-formats/json/formats.md +++ b/docs/integrations/data-ingestion/data-formats/json/formats.md @@ -32,9 +32,9 @@ We provide examples of reading and loading JSON in other common formats below. ## Reading JSON as an object {#reading-json-as-an-object} -Our previous examples show how `JSONEachRow` reads newline-delimited JSON, with each line read as a separate object mapped to a table row and each key to a column. This is ideal for cases where the JSON is predictable with single types for each column. +Our previous examples show how `JSONEachRow` reads newline-delimited JSON, with each line read as a separate object mapped to a table row and each key to a column. This is ideal for cases where the JSON is predictable with single types for each column. -In contrast, `JSONAsObject` treats each line as a single `JSON` object and stores it in a single column, of type [`JSON`](/sql-reference/data-types/newjson), making it better suited for nested JSON payloads and cases where the keys are dynamic and have potentially more than one type. 
+In contrast, `JSONAsObject` treats each line as a single `JSON` object and stores it in a single column, of type [`JSON`](/sql-reference/data-types/newjson), making it better suited for nested JSON payloads and cases where the keys are dynamic and have potentially more than one type. Use `JSONEachRow` for row-wise inserts, and [`JSONAsObject`](/interfaces/formats/JSONAsObject) when storing flexible or dynamic JSON data. @@ -96,7 +96,7 @@ Code: 117. DB::Exception: JSON objects have ambiguous data: in some objects path To increase the maximum number of rows/bytes to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference. You can specify the structure manually: (in file/uri bluesky/file_0001.json.gz). (CANNOT_EXTRACT_TABLE_STRUCTURE) ``` - + Conversely, `JSONAsObject` can be used in this case as the `JSON` type supports multiple types for the same subcolumn. ```sql @@ -432,7 +432,6 @@ ClickHouse allows exporting to and importing data from [BSON](https://bsonspec.o To import BSON data, we use the [BSONEachRow](/interfaces/formats.md/#bsoneachrow) format. Let's import data from [this BSON file](../assets/data.bson): - ```sql SELECT * FROM file('data.bson', BSONEachRow) ``` diff --git a/docs/integrations/data-ingestion/data-formats/json/inference.md b/docs/integrations/data-ingestion/data-formats/json/inference.md index cd0504e3d01..de429b11677 100644 --- a/docs/integrations/data-ingestion/data-formats/json/inference.md +++ b/docs/integrations/data-ingestion/data-formats/json/inference.md @@ -12,7 +12,7 @@ ClickHouse can automatically determine the structure of JSON data. This can be u * **Consistent structure** - The data from which you are going to infer types contains all the keys that you are interested in. Type inference is based on sampling the data up to a [maximum number of rows](/operations/settings/formats#input_format_max_rows_to_read_for_schema_inference) or [bytes](/operations/settings/formats#input_format_max_bytes_to_read_for_schema_inference). Data after the sample, with additional columns, will be ignored and can't be queried. * **Consistent types** - Data types for specific keys need to be compatible i.e. it must be possible to coerce one type to the other automatically. -If you have more dynamic JSON, to which new keys are added and multiple types are possible for the same path, see ["Working with semi-structured and dynamic data"](/integrations/data-formats/json/inference#working-with-semi-structured-data). + If you have more dynamic JSON, to which new keys are added and multiple types are possible for the same path, see ["Working with semi-structured and dynamic data"](/integrations/data-formats/json/inference#working-with-semi-structured-data). ## Detecting types {#detecting-types} @@ -66,7 +66,6 @@ As well as detecting the schema, JSON schema inference will automatically infer Using the [s3 function](/sql-reference/table-functions/s3) with the `DESCRIBE` command shows the types that will be inferred. - ```sql DESCRIBE TABLE s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/arxiv/arxiv.json.gz') SETTINGS describe_compact_output = 1 @@ -287,7 +286,6 @@ If you know your JSON is highly dynamic with many unique keys and multiple types Consider the following example from an extended version of the above [Python PyPI dataset](https://clickpy.clickhouse.com/) dataset. Here we have added an arbitrary `tags` column with random key value pairs. 
- ```json { "date": "2022-09-22", @@ -315,11 +313,11 @@ DESCRIBE s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/pypi/pypi 9 rows in set. Elapsed: 127.066 sec. ``` -The primary issue here is that the `JSONEachRow` format is used for inference. This attempts to infer **a column type per key in the JSON** - effectively trying to apply a static schema to the data without using the [`JSON`](/sql-reference/data-types/newjson) type. +The primary issue here is that the `JSONEachRow` format is used for inference. This attempts to infer **a column type per key in the JSON** - effectively trying to apply a static schema to the data without using the [`JSON`](/sql-reference/data-types/newjson) type. With thousands of unique columns this approach to inference is slow. As an alternative, users can use the `JSONAsObject` format. -`JSONAsObject` treats the entire input as a single JSON object and stores it in a single column of type [`JSON`](/sql-reference/data-types/newjson), making it better suited for highly dynamic or nested JSON payloads. +`JSONAsObject` treats the entire input as a single JSON object and stores it in a single column of type [`JSON`](/sql-reference/data-types/newjson), making it better suited for highly dynamic or nested JSON payloads. ```sql DESCRIBE TABLE s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/pypi/pypi_with_tags/sample_rows.json.gz', 'JSONAsObject') diff --git a/docs/integrations/data-ingestion/data-formats/json/loading.md b/docs/integrations/data-ingestion/data-formats/json/loading.md index 6b2c97bd5d5..1f604badf14 100644 --- a/docs/integrations/data-ingestion/data-formats/json/loading.md +++ b/docs/integrations/data-ingestion/data-formats/json/loading.md @@ -31,9 +31,9 @@ Consider the following JSON sample, representing a row from the [Python PyPI dat } ``` -In order to load this JSON object into ClickHouse, a table schema must be defined. +In order to load this JSON object into ClickHouse, a table schema must be defined. -In this simple case, our structure is static, our column names are known, and their types are well-defined. +In this simple case, our structure is static, our column names are known, and their types are well-defined. Whereas ClickHouse supports semi-structured data through a JSON type, where key names and their types can be dynamic, this is unnecessary here. @@ -115,7 +115,6 @@ FORMAT JSONEachRow These examples assume the use of the `JSONEachRow` format. Other common JSON formats are supported, with examples of loading these provided [here](/integrations/data-formats/json/other-formats). - ## Loading semi-structured JSON {#loading-semi-structured-json} Our previous example loaded JSON which was static with well known key names and types. This is often not the case - keys can be added or their types can change. This is common in use cases such as Observability data. @@ -124,7 +123,6 @@ ClickHouse handles this through a dedicated [`JSON`](/sql-reference/data-types/n Consider the following example from an extended version of the above [Python PyPI dataset](https://clickpy.clickhouse.com/) dataset. Here we have added an arbitrary `tags` column with random key value pairs. - ```json { "date": "2022-09-22", @@ -191,7 +189,7 @@ LIMIT 2 2 rows in set. Elapsed: 0.149 sec. ``` -Notice the performance difference here on loading data. The JSON column requires type inference at insert time as well as additional storage if columns exist that have more than one type. 
Although the JSON type can be configured (see [Designing JSON schema](/integrations/data-formats/json/schema)) for equivalent performance to explicitly declaring columns, it is intentionally flexible out-of-the-box. This flexibility, however, comes at some cost. +Notice the performance difference here on loading data. The JSON column requires type inference at insert time as well as additional storage if columns exist that have more than one type. Although the JSON type can be configured (see [Designing JSON schema](/integrations/data-formats/json/schema)) for equivalent performance to explicitly declaring columns, it is intentionally flexible out-of-the-box. This flexibility, however, comes at some cost. ### When to use the JSON type {#when-to-use-the-json-type} @@ -201,10 +199,10 @@ Use the JSON type when your data: * Contains **values with varying types** (e.g., a path might sometimes contain a string, sometimes a number). * Requires schema flexibility where strict typing isn't viable. -If your data structure is known and consistent, there is rarely a need for the JSON type, even if your data is in JSON format. Specifically, if your data has: + If your data structure is known and consistent, there is rarely a need for the JSON type, even if your data is in JSON format. Specifically, if your data has: * **A flat structure with known keys**: use standard column types e.g. String. * **Predictable nesting**: use Tuple, Array, or Nested types for these structures. * **Predictable structure with varying types**: consider Dynamic or Variant types instead. -You can also mix approaches as we have done in the above example, using static columns for predictable top-level keys and a single JSON column for a dynamic section of the payload. + You can also mix approaches as we have done in the above example, using static columns for predictable top-level keys and a single JSON column for a dynamic section of the payload. diff --git a/docs/integrations/data-ingestion/data-formats/json/other.md b/docs/integrations/data-ingestion/data-formats/json/other.md index 6df2fb5b12f..8063fd6568c 100644 --- a/docs/integrations/data-ingestion/data-formats/json/other.md +++ b/docs/integrations/data-ingestion/data-formats/json/other.md @@ -166,41 +166,41 @@ A faster and more strict set of functions are available. These `simpleJSON*` fun "path": "/images/hm_bg.jpg", "version": "HTTP/1.0"}, "status": 200, "size": 24736} ``` -Whereas, the following will parse correctly: + Whereas, the following will parse correctly: -```json -{"@timestamp":893964617,"clientip":"40.135.0.0","request":{"method":"GET", + ```json + {"@timestamp":893964617,"clientip":"40.135.0.0","request":{"method":"GET", "path":"/images/hm_bg.jpg","version":"HTTP/1.0"},"status":200,"size":24736} -In some circumstances, where performance is critical and your JSON meets the above requirements, these may be appropriate. An example of the earlier query, re-written to use `simpleJSON*` functions, is shown below: + In some circumstances, where performance is critical and your JSON meets the above requirements, these may be appropriate. 
An example of the earlier query, re-written to use `simpleJSON*` functions, is shown below: -```sql -SELECT + ```sql + SELECT toYear(parseDateTimeBestEffort(simpleJSONExtractString(simpleJSONExtractRaw(body, 'versions'), 'created'))) AS published_year, count() AS c -FROM arxiv -GROUP BY published_year -ORDER BY published_year ASC -LIMIT 10 - -┌─published_year─┬─────c─┐ -│ 1986 │ 1 │ -│ 1988 │ 1 │ -│ 1989 │ 6 │ -│ 1990 │ 26 │ -│ 1991 │ 353 │ -│ 1992 │ 3190 │ -│ 1993 │ 6729 │ -│ 1994 │ 10078 │ -│ 1995 │ 13006 │ -│ 1996 │ 15872 │ -└────────────────┴───────┘ - -10 rows in set. Elapsed: 0.964 sec. Processed 2.48 million rows, 4.21 GB (2.58 million rows/s., 4.36 GB/s.) -Peak memory usage: 211.49 MiB. -``` + FROM arxiv + GROUP BY published_year + ORDER BY published_year ASC + LIMIT 10 + + ┌─published_year─┬─────c─┐ + │ 1986 │ 1 │ + │ 1988 │ 1 │ + │ 1989 │ 6 │ + │ 1990 │ 26 │ + │ 1991 │ 353 │ + │ 1992 │ 3190 │ + │ 1993 │ 6729 │ + │ 1994 │ 10078 │ + │ 1995 │ 13006 │ + │ 1996 │ 15872 │ + └────────────────┴───────┘ + + 10 rows in set. Elapsed: 0.964 sec. Processed 2.48 million rows, 4.21 GB (2.58 million rows/s., 4.36 GB/s.) + Peak memory usage: 211.49 MiB. + ``` -The above query uses the `simpleJSONExtractString` to extract the `created` key, exploiting the fact we want the first value only for the published date. In this case, the limitations of the `simpleJSON*` functions are acceptable for the gain in performance. + The above query uses the `simpleJSONExtractString` to extract the `created` key, exploiting the fact we want the first value only for the published date. In this case, the limitations of the `simpleJSON*` functions are acceptable for the gain in performance. ## Using the Map type {#using-map} @@ -212,9 +212,9 @@ Although `Map`s give a simple way to represent nested structures, they have some - Accessing sub-columns requires a special map syntax since the fields don't exist as columns. The entire object _is_ a column. - Accessing a subcolumn loads the entire `Map` value i.e. all siblings and their respective values. For larger maps, this can result in a significant performance penalty. -:::note String keys -When modelling objects as `Map`s, a `String` key is used to store the JSON key name. The map will therefore always be `Map(String, T)`, where `T` depends on the data. -::: + :::note String keys + When modelling objects as `Map`s, a `String` key is used to store the JSON key name. The map will therefore always be `Map(String, T)`, where `T` depends on the data. + ::: #### Primitive values {#primitive-values} @@ -351,13 +351,6 @@ The application of maps in this case is typically rare, and suggests that the da } ``` - - - - - - - ## Using the Nested type {#using-nested} The [Nested type](/sql-reference/data-types/nested-data-structures/nested) can be used to model static objects which are rarely subject to change, offering an alternative to `Tuple` and `Array(Tuple)`. We generally recommend avoiding using this type for JSON as its behavior is often confusing. The primary benefit of `Nested` is that sub-columns can be used in ordering keys. @@ -437,8 +430,8 @@ A few important points to note here: ``` * The nested fields `method`, `path`, and `version` need to be passed as JSON arrays i.e. 
- ```json - { + ```json + { "@timestamp": 897819077, "clientip": "45.212.12.0", "request": { @@ -454,21 +447,21 @@ A few important points to note here: }, "status": 200, "size": 3305 - } - ``` + } + ``` -Columns can be queried using a dot notation: + Columns can be queried using a dot notation: -```sql -SELECT clientip, status, size, `request.method` FROM http WHERE has(request.method, 'GET'); + ```sql + SELECT clientip, status, size, `request.method` FROM http WHERE has(request.method, 'GET'); -┌─clientip────┬─status─┬─size─┬─request.method─┐ -│ 45.212.12.0 │ 200 │ 3305 │ ['GET'] │ -└─────────────┴────────┴──────┴────────────────┘ -1 row in set. Elapsed: 0.002 sec. -``` + ┌─clientip────┬─status─┬─size─┬─request.method─┐ + │ 45.212.12.0 │ 200 │ 3305 │ ['GET'] │ + └─────────────┴────────┴──────┴────────────────┘ + 1 row in set. Elapsed: 0.002 sec. + ``` -Note the use of `Array` for the sub-columns means the full breath [Array functions](/sql-reference/functions/array-functions) can potentially be exploited, including the [`ARRAY JOIN`](/sql-reference/statements/select/array-join) clause - useful if your columns have multiple values. + Note the use of `Array` for the sub-columns means the full breath [Array functions](/sql-reference/functions/array-functions) can potentially be exploited, including the [`ARRAY JOIN`](/sql-reference/statements/select/array-join) clause - useful if your columns have multiple values. #### flatten_nested=0 {#flatten_nested0} @@ -515,8 +508,8 @@ A few important points to note here: * The `Nested` type is preserved in `SHOW CREATE TABLE`. Underneath this column is effectively a `Array(Tuple(Nested(method LowCardinality(String), path String, version LowCardinality(String))))` * As a result, we are required to insert `request` as an array i.e. - ```json - { + ```json + { "timestamp": 897819077, "clientip": "45.212.12.0", "request": [ @@ -528,19 +521,19 @@ A few important points to note here: ], "status": 200, "size": 3305 - } - ``` + } + ``` -Columns can again be queried using a dot notation: + Columns can again be queried using a dot notation: -```sql -SELECT clientip, status, size, `request.method` FROM http WHERE has(request.method, 'GET'); + ```sql + SELECT clientip, status, size, `request.method` FROM http WHERE has(request.method, 'GET'); -┌─clientip────┬─status─┬─size─┬─request.method─┐ -│ 45.212.12.0 │ 200 │ 3305 │ ['GET'] │ -└─────────────┴────────┴──────┴────────────────┘ -1 row in set. Elapsed: 0.002 sec. -``` + ┌─clientip────┬─status─┬─size─┬─request.method─┐ + │ 45.212.12.0 │ 200 │ 3305 │ ['GET'] │ + └─────────────┴────────┴──────┴────────────────┘ + 1 row in set. Elapsed: 0.002 sec. 
+ ``` ### Example {#example} diff --git a/docs/integrations/data-ingestion/data-formats/json/schema.md b/docs/integrations/data-ingestion/data-formats/json/schema.md index fccd058ccfe..327196fb8dc 100644 --- a/docs/integrations/data-ingestion/data-formats/json/schema.md +++ b/docs/integrations/data-ingestion/data-formats/json/schema.md @@ -11,7 +11,6 @@ import json_column_per_type from '@site/static/images/integrations/data-ingestio import json_offsets from '@site/static/images/integrations/data-ingestion/data-formats/json_offsets.png'; import shared_json_column from '@site/static/images/integrations/data-ingestion/data-formats/json_shared_column.png'; - # Designing your schema While [schema inference](/integrations/data-formats/json/inference) can be used to establish an initial schema for JSON data and query JSON data files in place, e.g., in S3, users should aim to establish an optimized versioned schema for their data. We discuss the recommended approach for modeling JSON structures below. @@ -21,23 +20,23 @@ While [schema inference](/integrations/data-formats/json/inference) can be used The principal task on defining a schema for JSON is to determine the appropriate type for each key's value. We recommended users apply the following rules recursively on each key in the JSON hierarchy to determine the appropriate type for each key. 1. **Primitive types** - If the key's value is a primitive type, irrespective of whether it is part of a sub-object or on the root, ensure you select its type according to general schema [design best practices](/data-modeling/schema-design) and [type optimization rules](/data-modeling/schema-design#optimizing-types). Arrays of primitives, such as `phone_numbers` below, can be modeled as `Array()` e.g., `Array(String)`. -2. **Static vs dynamic** - If the key's value is a complex object i.e. either an object or an array of objects, establish whether it is subject to change. Objects that rarely have new keys, where the addition of a new key can be predicted and handled with a schema change via [`ALTER TABLE ADD COLUMN`](/sql-reference/statements/alter/column#add-column), can be considered **static**. This includes objects where only a subset of the keys may be provided on some JSON documents. Objects where new keys are added frequently and/or are not predictable should be considered **dynamic**. **The exception here is structures with hundreds or thousands of sub keys which can be considered dynamic for convenience purposes**. +2. **Static vs dynamic** - If the key's value is a complex object i.e. either an object or an array of objects, establish whether it is subject to change. Objects that rarely have new keys, where the addition of a new key can be predicted and handled with a schema change via [`ALTER TABLE ADD COLUMN`](/sql-reference/statements/alter/column#add-column), can be considered **static**. This includes objects where only a subset of the keys may be provided on some JSON documents. Objects where new keys are added frequently and/or are not predictable should be considered **dynamic**. **The exception here is structures with hundreds or thousands of sub keys which can be considered dynamic for convenience purposes**. -To establish whether a value is **static** or **dynamic**, see the relevant sections [**Handling static objects**](/integrations/data-formats/json/schema#handling-static-structures) and [**Handling dynamic objects**](/integrations/data-formats/json/schema#handling-semi-structured-dynamic-structures) below. 
+ To establish whether a value is **static** or **dynamic**, see the relevant sections [**Handling static objects**](/integrations/data-formats/json/schema#handling-static-structures) and [**Handling dynamic objects**](/integrations/data-formats/json/schema#handling-semi-structured-dynamic-structures) below.
-**Important:** The above rules should be applied recursively. If a key's value is determined to be dynamic, no further evaluation is required and the guidelines in [**Handling dynamic objects**](/integrations/data-formats/json/schema#handling-semi-structured-dynamic-structures) can be followed. If the object is static, continue to assess the subkeys until either key values are primitive or dynamic keys are encountered. + **Important:** The above rules should be applied recursively. If a key's value is determined to be dynamic, no further evaluation is required and the guidelines in [**Handling dynamic objects**](/integrations/data-formats/json/schema#handling-semi-structured-dynamic-structures) can be followed. If the object is static, continue to assess the subkeys until either key values are primitive or dynamic keys are encountered. -To illustrate these rules, we use the following JSON example representing a person: + To illustrate these rules, we use the following JSON example representing a person: -```json -{ - "id": 1, - "name": "Clicky McCliickHouse", - "username": "Clicky", - "email": "clicky@clickhouse.com", - "address": [ + ```json + { + "id": 1, + "name": "Clicky McCliickHouse", + "username": "Clicky", + "email": "clicky@clickhouse.com", + "address": [ { "street": "Victor Plains", "suite": "Suite 879", @@ -48,22 +47,22 @@ To illustrate these rules, we use the following JSON example representing a pers "lng": -34.4618 } } - ], - "phone_numbers": [ + ], + "phone_numbers": [ "010-692-6593", "020-192-3333" - ], - "website": "clickhouse.com", - "company": { + ], + "website": "clickhouse.com", + "company": { "name": "ClickHouse", "catchPhrase": "The real-time data warehouse for analytics", "labels": { "type": "database systems", "founded": "2021" } - }, - "dob": "2007-03-31", - "tags": { + }, + "dob": "2007-03-31", + "tags": { "hobby": "Databases", "holidays": [ { @@ -75,20 +74,20 @@ To illustrate these rules, we use the following JSON example representing a pers "model": "Tesla", "year": 2023 } - } -} -``` + } + } + ``` -Applying these rules: + Applying these rules: - The root keys `name`, `username`, `email`, `website` can be represented as type `String`. The column `phone_numbers` is an Array primitive of type `Array(String)`, with `dob` and `id` type `Date` and `UInt32` respectively. - New keys will not be added to the `address` object (only new address objects), and it can thus be considered **static**. If we recurse, all of the sub-columns can be considered primitives (and type `String`) except `geo`. This is also a static structure with two `Float32` columns, `lat` and `lon`. - The `tags` column is **dynamic**. We assume new arbitrary tags can be added to this object of any type and structure. - The `company` object is **static** and will always contain at most the 3 keys specified. The subkeys `name` and `catchPhrase` are of type `String`. The key `labels` is **dynamic**. We assume new arbitrary tags can be added to this object. Values will always be key-value pairs of type string. -:::note -Structures with hundreds or thousands of static keys can be considered dynamic, as it is rarely realistic to statically declare the columns for these. However, where possible [skip paths](#using-type-hints-and-skipping-paths) which are not needed to save both storage and inference overhead. -::: + :::note + Structures with hundreds or thousands of static keys can be considered dynamic, as it is rarely realistic to statically declare the columns for these. 
However, where possible [skip paths](#using-type-hints-and-skipping-paths) which are not needed to save both storage and inference overhead. + ::: ## Handling static structures {#handling-static-structures} @@ -200,7 +199,6 @@ ENGINE = MergeTree ORDER BY company.name ``` - ### Handling default values {#handling-default-values} Even if JSON objects are structured, they are often sparse with only a subset of the known keys provided. Fortunately, the `Tuple` type does not require all columns in the JSON payload. If not provided, default values will be used. @@ -367,18 +365,17 @@ More specifically, use the JSON type when your data: - Requires schema flexibility where strict typing isn't viable. - You have **hundreds or even thousands** of paths which are static but simply not realistic to declare explicitly. This tends to be a rare. -Consider our [earlier person JSON](/integrations/data-formats/json/schema#static-vs-dynamic-json) where the `company.labels` object was determined to be dynamic. + Consider our [earlier person JSON](/integrations/data-formats/json/schema#static-vs-dynamic-json) where the `company.labels` object was determined to be dynamic. -Let's suppose that `company.labels` contains arbitrary keys. Additionally, the type for any key in this structure may not be consistent between rows. For example: + Let's suppose that `company.labels` contains arbitrary keys. Additionally, the type for any key in this structure may not be consistent between rows. For example: - -```json -{ - "id": 1, - "name": "Clicky McCliickHouse", - "username": "Clicky", - "email": "clicky@clickhouse.com", - "address": [ + ```json + { + "id": 1, + "name": "Clicky McCliickHouse", + "username": "Clicky", + "email": "clicky@clickhouse.com", + "address": [ { "street": "Victor Plains", "suite": "Suite 879", @@ -389,13 +386,13 @@ Let's suppose that `company.labels` contains arbitrary keys. Additionally, the t "lng": -34.4618 } } - ], - "phone_numbers": [ + ], + "phone_numbers": [ "010-692-6593", "020-192-3333" - ], - "website": "clickhouse.com", - "company": { + ], + "website": "clickhouse.com", + "company": { "name": "ClickHouse", "catchPhrase": "The real-time data warehouse for analytics", "labels": { @@ -403,9 +400,9 @@ Let's suppose that `company.labels` contains arbitrary keys. Additionally, the t "founded": "2021", "employees": 250 } - }, - "dob": "2007-03-31", - "tags": { + }, + "dob": "2007-03-31", + "tags": { "hobby": "Databases", "holidays": [ { @@ -417,16 +414,16 @@ Let's suppose that `company.labels` contains arbitrary keys. Additionally, the t "model": "Tesla", "year": 2023 } - } -} -``` + } + } + ``` -```json -{ - "id": 2, - "name": "Analytica Rowe", - "username": "Analytica", - "address": [ + ```json + { + "id": 2, + "name": "Analytica Rowe", + "username": "Analytica", + "address": [ { "street": "Maple Avenue", "suite": "Apt. 402", @@ -437,13 +434,13 @@ Let's suppose that `company.labels` contains arbitrary keys. Additionally, the t "lng": -74.006 } } - ], - "phone_numbers": [ + ], + "phone_numbers": [ "123-456-7890", "555-867-5309" - ], - "website": "fastdata.io", - "company": { + ], + "website": "fastdata.io", + "company": { "name": "FastData Inc.", "catchPhrase": "Streamlined analytics at scale", "labels": { @@ -454,9 +451,9 @@ Let's suppose that `company.labels` contains arbitrary keys. 
Additionally, the t "dissolved": 2023, "employees": 10 } - }, - "dob": "1992-07-15", - "tags": { + }, + "dob": "1992-07-15", + "tags": { "hobby": "Running simulations", "holidays": [ { @@ -468,22 +465,22 @@ Let's suppose that `company.labels` contains arbitrary keys. Additionally, the t "model": "Audi e-tron", "year": 2022 } - } -} -``` + } + } + ``` -Given the dynamic nature of the `company.labels` column between objects, with respect to keys and types, we have several options to model this data: + Given the dynamic nature of the `company.labels` column between objects, with respect to keys and types, we have several options to model this data: - **Single JSON column** - represents the entire schema as a single `JSON` column, allowing all structures to be dynamic beneath this. - **Targeted JSON column** - only use the `JSON` type for the `company.labels` column, retaining the structured schema used above for all other columns. -While the first approach [does not align with previous methodology](#static-vs-dynamic-json), a single JSON column approach is useful for prototyping and data engineering tasks. + While the first approach [does not align with previous methodology](#static-vs-dynamic-json), a single JSON column approach is useful for prototyping and data engineering tasks. -For production deployments of ClickHouse at scale, we recommend being specific with structure and using the JSON type for targeted dynamic sub-structures where possible. + For production deployments of ClickHouse at scale, we recommend being specific with structure and using the JSON type for targeted dynamic sub-structures where possible. -A strict schema has a number of benefits: + A strict schema has a number of benefits: -- **Data validation** – enforcing a strict schema avoids the risk of column explosion, outside of specific structures. +- **Data validation** – enforcing a strict schema avoids the risk of column explosion, outside of specific structures. - **Avoids risk of column explosion** - Although the JSON type scales to potentially thousands of columns, where subcolumns are stored as dedicated columns, this can lead to a column file explosion where an excessive number of column files are created that impacts performance. To mitigate this, the underlying [Dynamic type](/sql-reference/data-types/dynamic) used by JSON offers a [`max_dynamic_paths`](/sql-reference/data-types/newjson#reading-json-paths-as-sub-columns) parameter, which limits the number of unique paths stored as separate column files. Once the threshold is reached, additional paths are stored in a shared column file using a compact encoded format, maintaining performance and storage efficiency while supporting flexible data ingestion. Accessing this shared column file is, however, not as performant. Note, however, that the JSON column can be used with [type hints](#using-type-hints-and-skipping-paths). "Hinted" columns will deliver the same performance as dedicated columns. - **Simpler introspection of paths and types** - Although the JSON type supports [introspection functions](/sql-reference/data-types/newjson#introspection-functions) to determine the types and paths that have been inferred, static structures can be simpler to explore e.g. with `DESCRIBE`. 
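As a rough sketch of the type hints and `max_dynamic_paths` parameter mentioned above (the table name `people_hinted`, the cap of 64 paths, and the skipped `tags.debug` path are illustrative assumptions, not values taken from this guide), a capped `JSON` column with a hinted sub-path might be declared as follows:

```sql
-- Illustrative sketch: 'people_hinted', the 64-path cap, and 'tags.debug' are assumed names/values.
CREATE TABLE people_hinted
(
    -- 'username' is stored as a dedicated String column thanks to the type hint;
    -- other paths are inferred dynamically, up to 64 unique paths, after which
    -- additional paths fall back to the shared column file; 'tags.debug' is never stored.
    json JSON(max_dynamic_paths = 64, username String, SKIP tags.debug)
)
ENGINE = MergeTree
ORDER BY tuple();

-- Introspect which dynamic paths and types have been inferred so far.
SELECT JSONDynamicPathsWithTypes(json) AS paths
FROM people_hinted;
```

Hinted paths such as `username` behave like ordinary columns for both storage and querying, while the cap keeps the number of dedicated column files bounded.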
@@ -515,7 +512,7 @@ We provide a [type hint](#using-type-hints-and-skipping-paths) for the `username Inserting rows into the above table can be achieved using the `JSONAsObject` format: ```sql -INSERT INTO people FORMAT JSONAsObject +INSERT INTO people FORMAT JSONAsObject {"id":1,"name":"Clicky McCliickHouse","username":"Clicky","email":"clicky@clickhouse.com","address":[{"street":"Victor Plains","suite":"Suite 879","city":"Wisokyburgh","zipcode":"90566-7771","geo":{"lat":-43.9509,"lng":-34.4618}}],"phone_numbers":["010-692-6593","020-192-3333"],"website":"clickhouse.com","company":{"name":"ClickHouse","catchPhrase":"The real-time data warehouse for analytics","labels":{"type":"database systems","founded":"2021","employees":250}},"dob":"2007-03-31","tags":{"hobby":"Databases","holidays":[{"year":2024,"location":"Azores, Portugal"}],"car":{"model":"Tesla","year":2023}}} 1 row in set. Elapsed: 0.028 sec. @@ -526,7 +523,6 @@ INSERT INTO people FORMAT JSONAsObject 1 row in set. Elapsed: 0.004 sec. ``` - ```sql SELECT * FROM people @@ -637,7 +633,6 @@ FROM people In order to return nested sub-objects, the `^` is required. This is a design choice to avoid reading a high number of columns - unless explicitly requested. Objects accessed without `^` will return `NULL` as shown below: - ```sql -- sub objects will not be returned by default SELECT json.company.labels @@ -662,7 +657,6 @@ FROM people 2 rows in set. Elapsed: 0.004 sec. ``` - ### Targeted JSON column {#targeted-json-column} While useful in prototyping and data engineering challenges, we recommend using an explicit schema in production where possible. @@ -701,7 +695,6 @@ INSERT INTO people FORMAT JSONEachRow 1 row in set. Elapsed: 0.440 sec. ``` - ```sql SELECT * FROM people @@ -738,7 +731,6 @@ tags: {"hobby":"Databases","holidays":[{"year":2024,"location":"Azores, [Introspection functions](/sql-reference/data-types/newjson#introspection-functions) can be used to determine the inferred paths and types for the `company.labels` column. - ```sql SELECT JSONDynamicPathsWithTypes(company.labels) AS paths FROM people @@ -920,12 +912,11 @@ FORMAT PrettyJSONEachRow 2 rows in set. Elapsed: 0.004 sec. ``` +#### Optimizing performance with type hints {#optimizing-performance-with-type-hints} -#### Optimizing performance with type hints {#optimizing-performance-with-type-hints} - -Type hints offer more than just a way to avoid unnecessary type inference - they eliminate storage and processing indirection entirely, as well as allowing [optimal primitive types](/data-modeling/schema-design#optimizing-types) to be specified. JSON paths with type hints are always stored just like traditional columns, bypassing the need for [**discriminator columns**](https://clickhouse.com/blog/a-new-powerful-json-data-type-for-clickhouse#storage-extension-for-dynamically-changing-data) or dynamic resolution during query time. +Type hints offer more than just a way to avoid unnecessary type inference - they eliminate storage and processing indirection entirely, as well as allowing [optimal primitive types](/data-modeling/schema-design#optimizing-types) to be specified. JSON paths with type hints are always stored just like traditional columns, bypassing the need for [**discriminator columns**](https://clickhouse.com/blog/a-new-powerful-json-data-type-for-clickhouse#storage-extension-for-dynamically-changing-data) or dynamic resolution during query time. 
-This means that with well-defined type hints, nested JSON keys achieve the same performance and efficiency as if they were modeled as top-level columns from the outset. +This means that with well-defined type hints, nested JSON keys achieve the same performance and efficiency as if they were modeled as top-level columns from the outset. As a result, for datasets that are mostly consistent but still benefit from the flexibility of JSON, type hints provide a convenient way to preserve performance without needing to restructure your schema or ingest pipeline. diff --git a/docs/integrations/data-ingestion/data-formats/parquet.md b/docs/integrations/data-ingestion/data-formats/parquet.md index 4f7d8d086dd..0f3e65c1bdb 100644 --- a/docs/integrations/data-ingestion/data-formats/parquet.md +++ b/docs/integrations/data-ingestion/data-formats/parquet.md @@ -73,7 +73,6 @@ ORDER BY (date, path); Now we can import data using the `FROM INFILE` clause: - ```sql INSERT INTO sometable FROM INFILE 'data.parquet' FORMAT Parquet; @@ -129,7 +128,6 @@ DESCRIBE TABLE imported_from_parquet; By default, ClickHouse is strict with column names, types, and values. But sometimes, we can skip nonexistent columns or unsupported values during import. This can be managed with [Parquet settings](/interfaces/formats/Parquet#format-settings). - ## Exporting to Parquet format {#exporting-to-parquet-format} :::tip @@ -181,7 +179,6 @@ FROM file('time.parquet', Parquet); └───┴─────────────────────┘ ``` - ## Further reading {#further-reading} ClickHouse introduces support for many formats, both text, and binary, to cover various scenarios and platforms. Explore more formats and ways to work with them in the following articles: @@ -193,4 +190,4 @@ ClickHouse introduces support for many formats, both text, and binary, to cover - [Native and binary formats](binary.md) - [SQL formats](sql.md) -And also check [clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) - a portable full-featured tool to work on local/remote files without the need for Clickhouse server. + And also check [clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) - a portable full-featured tool to work on local/remote files without the need for Clickhouse server. diff --git a/docs/integrations/data-ingestion/data-formats/sql.md b/docs/integrations/data-ingestion/data-formats/sql.md index 3d989926ca3..4f349305708 100644 --- a/docs/integrations/data-ingestion/data-formats/sql.md +++ b/docs/integrations/data-ingestion/data-formats/sql.md @@ -51,7 +51,6 @@ SELECT * FROM some_data LIMIT 3 FORMAT Values ('Bangor_City_Forest','2015-07-01',34),('Alireza_Afzal','2017-02-01',24),('Akhaura-Laksam-Chittagong_Line','2015-09-01',30) ``` - ## Inserting data from SQL dumps {#inserting-data-from-sql-dumps} To read SQL dumps, [MySQLDump](/interfaces/formats.md/#mysqldump) is used: @@ -101,7 +100,6 @@ DESCRIBE TABLE table_from_mysql; └───────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ ``` - ## Other formats {#other-formats} ClickHouse introduces support for many formats, both text, and binary, to cover various scenarios and platforms. 
Explore more formats and ways to work with them in the following articles: @@ -113,4 +111,4 @@ ClickHouse introduces support for many formats, both text, and binary, to cover - [Native and binary formats](binary.md) - **SQL formats** -And also check [clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) - a portable full-featured tool to work on local/remote files without the need for ClickHouse server. + And also check [clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) - a portable full-featured tool to work on local/remote files without the need for ClickHouse server. diff --git a/docs/integrations/data-ingestion/data-formats/templates-regex.md b/docs/integrations/data-ingestion/data-formats/templates-regex.md index c3f4fc18f4b..a29f3164a5f 100644 --- a/docs/integrations/data-ingestion/data-formats/templates-regex.md +++ b/docs/integrations/data-ingestion/data-formats/templates-regex.md @@ -245,4 +245,4 @@ ClickHouse introduces support for many formats, both text, and binary, to cover - [Native and binary formats](binary.md) - [SQL formats](sql.md) -And also check [clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) - a portable full-featured tool to work on local/remote files without the need for Clickhouse server. + And also check [clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) - a portable full-featured tool to work on local/remote files without the need for Clickhouse server. diff --git a/docs/integrations/data-ingestion/data-ingestion-index.md b/docs/integrations/data-ingestion/data-ingestion-index.md index ab0de39e9f5..0f88aa8e1a4 100644 --- a/docs/integrations/data-ingestion/data-ingestion-index.md +++ b/docs/integrations/data-ingestion/data-ingestion-index.md @@ -24,4 +24,3 @@ For more information check out the pages below: | [Fivetran](/integrations/fivetran) | An automated data movement platform moving data out of, into and across your cloud data platforms. | | [NiFi](/integrations/nifi) | An open-source workflow management software designed to automate data flow between software systems. | | [Vector](/integrations/vector) | A high-performance observability data pipeline that puts organizations in control of their observability data. | - diff --git a/docs/integrations/data-ingestion/dbms/dynamodb/index.md b/docs/integrations/data-ingestion/dbms/dynamodb/index.md index f67bf5787ef..e32a45896b8 100644 --- a/docs/integrations/data-ingestion/dbms/dynamodb/index.md +++ b/docs/integrations/data-ingestion/dbms/dynamodb/index.md @@ -23,7 +23,7 @@ This page covers how set up CDC from DynamoDB to ClickHouse using ClickPipes. Th 1. The initial snapshot via S3 ClickPipes 2. Real-time updates via Kinesis ClickPipes -Data will be ingested into a `ReplacingMergeTree`. This table engine is commonly used for CDC scenarios to allow update operations to be applied. More on this pattern can be found in the following blog articles: + Data will be ingested into a `ReplacingMergeTree`. This table engine is commonly used for CDC scenarios to allow update operations to be applied. 
More on this pattern can be found in the following blog articles: * [Change Data Capture (CDC) with PostgreSQL and ClickHouse - Part 1](https://clickhouse.com/blog/clickhouse-postgresql-change-data-capture-cdc-part-1?loc=docs-rockest-migrations) * [Change Data Capture (CDC) with PostgreSQL and ClickHouse - Part 2](https://clickhouse.com/blog/clickhouse-postgresql-change-data-capture-cdc-part-2?loc=docs-rockest-migrations) @@ -68,56 +68,55 @@ We will want to create three tables: 2. A table to store the final flattened data (destination table) 3. A materialized view to flatten the data + For the example DynamoDB data above, the ClickHouse tables would look like this: -For the example DynamoDB data above, the ClickHouse tables would look like this: - -```sql -/* Snapshot table */ -CREATE TABLE IF NOT EXISTS "default"."snapshot" -( + ```sql + /* Snapshot table */ + CREATE TABLE IF NOT EXISTS "default"."snapshot" + ( `item` String -) -ORDER BY tuple(); + ) + ORDER BY tuple(); -/* Table for final flattened data */ -CREATE MATERIALIZED VIEW IF NOT EXISTS "default"."snapshot_mv" TO "default"."destination" AS -SELECT + /* Table for final flattened data */ + CREATE MATERIALIZED VIEW IF NOT EXISTS "default"."snapshot_mv" TO "default"."destination" AS + SELECT JSONExtractString(item, 'id', 'S') AS id, JSONExtractInt(item, 'age', 'N') AS age, JSONExtractString(item, 'first_name', 'S') AS first_name -FROM "default"."snapshot"; + FROM "default"."snapshot"; -/* Table for final flattened data */ -CREATE TABLE IF NOT EXISTS "default"."destination" ( + /* Table for final flattened data */ + CREATE TABLE IF NOT EXISTS "default"."destination" ( "id" String, "first_name" String, "age" Int8, "version" Int64 -) -ENGINE ReplacingMergeTree("version") -ORDER BY id; -``` + ) + ENGINE ReplacingMergeTree("version") + ORDER BY id; + ``` -There are a few requirements for the destination table: + There are a few requirements for the destination table: - This table must be a `ReplacingMergeTree` table - The table must have a `version` column - - In later steps, we will be mapping the `ApproximateCreationDateTime` field from the Kinesis stream to the `version` column. + - In later steps, we will be mapping the `ApproximateCreationDateTime` field from the Kinesis stream to the `version` column. - The table should use the partition key as the sorting key (specified by `ORDER BY`) - - Rows with the same sorting key will be deduplicated based on the `version` column. + - Rows with the same sorting key will be deduplicated based on the `version` column. ### Create the snapshot ClickPipe {#create-the-snapshot-clickpipe} Now you can create a ClickPipe to load the snapshot data from S3 into ClickHouse. Follow the S3 ClickPipe guide [here](/integrations/data-ingestion/clickpipes/object-storage.md), but use the following settings: - **Ingest path**: You will need to locate the path of the exported json files in S3. The path will look something like this: -```text -https://{bucket}.s3.amazonaws.com/{prefix}/AWSDynamoDB/{export-id}/data/* -``` + ```text + https://{bucket}.s3.amazonaws.com/{prefix}/AWSDynamoDB/{export-id}/data/* + ``` - **Format**: JSONEachRow - **Table**: Your snapshot table (e.g. `default.snapshot` in example above) -Once created, data will begin populating in the snapshot and destination tables. You do not need to wait for the snapshot load to finish before moving on to the next step. + Once created, data will begin populating in the snapshot and destination tables. 
You do not need to wait for the snapshot load to finish before moving on to the next step. ## 4. Create the Kinesis ClickPipe {#4-create-the-kinesis-clickpipe} @@ -127,10 +126,10 @@ Now we can set up the Kinesis ClickPipe to capture real-time changes from the Ki - **Table**: Your destination table (e.g. `default.destination` in example above) - **Flatten object**: true - **Column mappings**: - - `ApproximateCreationDateTime`: `version` - - Map other fields to the appropriate destination columns as shown below + - `ApproximateCreationDateTime`: `version` + - Map other fields to the appropriate destination columns as shown below - + ## 5. Cleanup (optional) {#5-cleanup-optional} diff --git a/docs/integrations/data-ingestion/dbms/jdbc-with-clickhouse.md b/docs/integrations/data-ingestion/dbms/jdbc-with-clickhouse.md index 65a73d4bdac..c7c394a39ca 100644 --- a/docs/integrations/data-ingestion/dbms/jdbc-with-clickhouse.md +++ b/docs/integrations/data-ingestion/dbms/jdbc-with-clickhouse.md @@ -38,7 +38,7 @@ You have access to a machine that has: 3. a current version of **Java** (e.g. OpenJDK Version >= 17) installed 4. a current version of **MySQL** (e.g. MySQL Version >=8) installed and running 5. a current version of **ClickHouse** [installed](/getting-started/install/install.mdx) and running -::: + ::: ## Install the ClickHouse JDBC Bridge locally {#install-the-clickhouse-jdbc-bridge-locally} @@ -85,19 +85,16 @@ in the config file above - in the value for the `jdbcUrl` you need to replace ``, and `` with appropriate values according to your running MySQL instance, e.g. `"jdbc:mysql://localhost:3306"` - you need to replace `` and `` with your MySQL credentials, if you don't use a password, you can delete the `"password": ""` line in the config file above - in the value for `driverUrls` we just specified a URL from which the current version of the MySQL JDBC driver can be downloaded. That's all we have to do, and the ClickHouse JDBC Bridge will automatically download that JDBC driver (into a OS specific directory). -::: - -
- -Now we are ready to start the ClickHouse JDBC Bridge: - ```bash - cd ~/clickhouse-jdbc-bridge - java -jar clickhouse-jdbc-bridge-2.0.7-shaded.jar - ``` -:::note -We started the ClickHouse JDBC Bridge in foreground mode. In order to stop the Bridge you can bring the Unix shell window from above in foreground and press `CTRL+C`. -::: + ::: + Now we are ready to start the ClickHouse JDBC Bridge: + ```bash + cd ~/clickhouse-jdbc-bridge + java -jar clickhouse-jdbc-bridge-2.0.7-shaded.jar + ``` + :::note + We started the ClickHouse JDBC Bridge in foreground mode. In order to stop the Bridge you can bring the Unix shell window from above in foreground and press `CTRL+C`. + ::: ## Use the JDBC connection from within ClickHouse {#use-the-jdbc-connection-from-within-clickhouse} @@ -105,40 +102,30 @@ ClickHouse can now access MySQL data by either using the [jdbc table function](/ The easiest way to execute the following examples is to copy and paste them into the [`clickhouse-client`](/interfaces/cli.md) or into the [Play UI](/interfaces/http.md). - - - jdbc Table Function: - ```sql - SELECT * FROM jdbc('mysql8', 'mydatabase', 'mytable'); - ``` -:::note -As the first parameter for the jdbc table function we are using the name of the named data source that we configured above. -::: - - + ```sql + SELECT * FROM jdbc('mysql8', 'mydatabase', 'mytable'); + ``` + :::note + As the first parameter for the jdbc table function we are using the name of the named data source that we configured above. + ::: - JDBC Table Engine: - ```sql - CREATE TABLE mytable ( + ```sql + CREATE TABLE mytable ( , ... - ) - ENGINE = JDBC('mysql8', 'mydatabase', 'mytable'); - - SELECT * FROM mytable; - ``` -:::note - As the first parameter for the jdbc engine clause we are using the name of the named data source that we configured above - - The schema of the ClickHouse JDBC engine table and schema of the connected MySQL table must be aligned, e.g. the column names and order must be the same, and the column data types must be compatible -::: - - - - + ) + ENGINE = JDBC('mysql8', 'mydatabase', 'mytable'); + SELECT * FROM mytable; + ``` + :::note + As the first parameter for the jdbc engine clause we are using the name of the named data source that we configured above + The schema of the ClickHouse JDBC engine table and schema of the connected MySQL table must be aligned, e.g. the column names and order must be the same, and the column data types must be compatible + ::: ## Install the ClickHouse JDBC Bridge externally {#install-the-clickhouse-jdbc-bridge-externally} @@ -148,53 +135,42 @@ This has the advantage that each ClickHouse host can access the JDBC Bridge. Oth In order to install the ClickHouse JDBC Bridge externally, we do the following steps: - 1. We install, configure and run the ClickHouse JDBC Bridge on a dedicated host by following the steps described in section 1 of this guide. 2. 
On each ClickHouse Host we add the following configuration block to the ClickHouse server configuration (depending on your chosen configuration format, use either the XML or YAML version): - ```xml - JDBC-Bridge-Host - 9019 +JDBC-Bridge-Host +9019 ``` - - ```yaml jdbc_bridge: - host: JDBC-Bridge-Host - port: 9019 +host: JDBC-Bridge-Host +port: 9019 ``` - -:::note - - you need to replace `JDBC-Bridge-Host` with the hostname or ip address of the dedicated ClickHouse JDBC Bridge host - - we specified the default ClickHouse JDBC Bridge port `9019`, if you are using a different port for the JDBC Bridge then you must adapt the configuration above accordingly -::: - - - - -[//]: # (## 4. Additional Info) - -[//]: # () -[//]: # (TODO: ) - -[//]: # (- mention that for jdbc table function it is more performant (not two queries each time) to also specify the schema as a parameter) + :::note + - you need to replace `JDBC-Bridge-Host` with the hostname or ip address of the dedicated ClickHouse JDBC Bridge host + - we specified the default ClickHouse JDBC Bridge port `9019`, if you are using a different port for the JDBC Bridge then you must adapt the configuration above accordingly + ::: -[//]: # () -[//]: # (- mention ad hoc query vs table query, saved query, named query) + [//]: # (## 4. Additional Info) -[//]: # () -[//]: # (- mention insert into ) + [//]: # () + [//]: # (TODO: ) + [//]: # (- mention that for jdbc table function it is more performant (not two queries each time) to also specify the schema as a parameter) + [//]: # () + [//]: # (- mention ad hoc query vs table query, saved query, named query) + [//]: # () + [//]: # (- mention insert into ) diff --git a/docs/integrations/data-ingestion/dbms/mysql/index.md b/docs/integrations/data-ingestion/dbms/mysql/index.md index 19c486d48c9..6838bcb0f64 100644 --- a/docs/integrations/data-ingestion/dbms/mysql/index.md +++ b/docs/integrations/data-ingestion/dbms/mysql/index.md @@ -25,127 +25,127 @@ The `MySQL` table engine allows you to connect ClickHouse to MySQL. **SELECT** a ### 1. Configure MySQL {#1-configure-mysql} -1. Create a database in MySQL: - ```sql - CREATE DATABASE db1; - ``` +1. Create a database in MySQL: + ```sql + CREATE DATABASE db1; + ``` 2. Create a table: - ```sql - CREATE TABLE db1.table1 ( + ```sql + CREATE TABLE db1.table1 ( id INT, column1 VARCHAR(255) - ); - ``` + ); + ``` 3. Insert sample rows: - ```sql - INSERT INTO db1.table1 + ```sql + INSERT INTO db1.table1 (id, column1) - VALUES + VALUES (1, 'abc'), (2, 'def'), (3, 'ghi'); - ``` + ``` 4. Create a user to connect from ClickHouse: - ```sql - CREATE USER 'mysql_clickhouse'@'%' IDENTIFIED BY 'Password123!'; - ``` + ```sql + CREATE USER 'mysql_clickhouse'@'%' IDENTIFIED BY 'Password123!'; + ``` 5. Grant privileges as needed. (For demonstration purposes, the `mysql_clickhouse` user is granted admin privileges.) - ```sql - GRANT ALL PRIVILEGES ON *.* TO 'mysql_clickhouse'@'%'; - ``` + ```sql + GRANT ALL PRIVILEGES ON *.* TO 'mysql_clickhouse'@'%'; + ``` -:::note -If you are using this feature in ClickHouse Cloud, you may need the to allow the ClickHouse Cloud IP addresses to access your MySQL instance. -Check the ClickHouse [Cloud Endpoints API](//cloud/get-started/query-endpoints.md) for egress traffic details. -::: + :::note + If you are using this feature in ClickHouse Cloud, you may need the to allow the ClickHouse Cloud IP addresses to access your MySQL instance. 
+ Check the ClickHouse [Cloud Endpoints API](//cloud/get-started/query-endpoints.md) for egress traffic details. + ::: ### 2. Define a Table in ClickHouse {#2-define-a-table-in-clickhouse} 1. Now let's create a ClickHouse table that uses the `MySQL` table engine: - ```sql - CREATE TABLE mysql_table1 ( + ```sql + CREATE TABLE mysql_table1 ( id UInt64, column1 String - ) - ENGINE = MySQL('mysql-host.domain.com','db1','table1','mysql_clickhouse','Password123!') - ``` + ) + ENGINE = MySQL('mysql-host.domain.com','db1','table1','mysql_clickhouse','Password123!') + ``` - The minimum parameters are: + The minimum parameters are: - |parameter|Description |example | - |---------|----------------------------|---------------------| - |host |hostname or IP |mysql-host.domain.com| - |database |mysql database name |db1 | - |table |mysql table name |table1 | - |user |username to connect to mysql|mysql_clickhouse | - |password |password to connect to mysql|Password123! | + |parameter|Description |example | + |---------|----------------------------|---------------------| + |host |hostname or IP |mysql-host.domain.com| + |database |mysql database name |db1 | + |table |mysql table name |table1 | + |user |username to connect to mysql|mysql_clickhouse | + |password |password to connect to mysql|Password123! | - :::note - View the [MySQL table engine](/engines/table-engines/integrations/mysql.md) doc page for a complete list of parameters. - ::: + :::note + View the [MySQL table engine](/engines/table-engines/integrations/mysql.md) doc page for a complete list of parameters. + ::: ### 3. Test the Integration {#3-test-the-integration} 1. In MySQL, insert a sample row: - ```sql - INSERT INTO db1.table1 + ```sql + INSERT INTO db1.table1 (id, column1) - VALUES + VALUES (4, 'jkl'); - ``` + ``` 2. Notice the existing rows from the MySQL table are in the ClickHouse table, along with the new row you just added: - ```sql - SELECT + ```sql + SELECT id, column1 - FROM mysql_table1 - ``` + FROM mysql_table1 + ``` - You should see 4 rows: - ```response - Query id: 6d590083-841e-4e95-8715-ef37d3e95197 + You should see 4 rows: + ```response + Query id: 6d590083-841e-4e95-8715-ef37d3e95197 - ┌─id─┬─column1─┐ - │ 1 │ abc │ - │ 2 │ def │ - │ 3 │ ghi │ - │ 4 │ jkl │ - └────┴─────────┘ + ┌─id─┬─column1─┐ + │ 1 │ abc │ + │ 2 │ def │ + │ 3 │ ghi │ + │ 4 │ jkl │ + └────┴─────────┘ - 4 rows in set. Elapsed: 0.044 sec. - ``` + 4 rows in set. Elapsed: 0.044 sec. + ``` 3. Let's add a row to the ClickHouse table: - ```sql - INSERT INTO mysql_table1 + ```sql + INSERT INTO mysql_table1 (id, column1) - VALUES + VALUES (5,'mno') - ``` - -4. Notice the new row appears in MySQL: - ```bash - mysql> select id,column1 from db1.table1; - ``` - - You should see the new row: - ```response - +------+---------+ - | id | column1 | - +------+---------+ - | 1 | abc | - | 2 | def | - | 3 | ghi | - | 4 | jkl | - | 5 | mno | - +------+---------+ - 5 rows in set (0.01 sec) - ``` + ``` + +4. 
Notice the new row appears in MySQL: + ```bash + mysql> select id,column1 from db1.table1; + ``` + + You should see the new row: + ```response + +------+---------+ + | id | column1 | + +------+---------+ + | 1 | abc | + | 2 | def | + | 3 | ghi | + | 4 | jkl | + | 5 | mno | + +------+---------+ + 5 rows in set (0.01 sec) + ``` ### Summary {#summary} diff --git a/docs/integrations/data-ingestion/dbms/odbc-with-clickhouse.md b/docs/integrations/data-ingestion/dbms/odbc-with-clickhouse.md index 09bea4169a8..73aae819a8e 100644 --- a/docs/integrations/data-ingestion/dbms/odbc-with-clickhouse.md +++ b/docs/integrations/data-ingestion/dbms/odbc-with-clickhouse.md @@ -9,5 +9,3 @@ description: 'Page describing the ODBC integration' import Content from '@site/docs/engines/table-engines/integrations/odbc.md'; - - diff --git a/docs/integrations/data-ingestion/dbms/postgresql/connecting-to-postgresql.md b/docs/integrations/data-ingestion/dbms/postgresql/connecting-to-postgresql.md index 58513cc9844..cd3bb13e6fa 100644 --- a/docs/integrations/data-ingestion/dbms/postgresql/connecting-to-postgresql.md +++ b/docs/integrations/data-ingestion/dbms/postgresql/connecting-to-postgresql.md @@ -24,162 +24,161 @@ The `PostgreSQL` table engine allows **SELECT** and **INSERT** operations on dat This article is to illustrate basic methods of integration using one table. ### 1. Setting up PostgreSQL {#1-setting-up-postgresql} -1. In `postgresql.conf`, add the following entry to enable PostgreSQL to listen on the network interfaces: - ```text - listen_addresses = '*' - ``` +1. In `postgresql.conf`, add the following entry to enable PostgreSQL to listen on the network interfaces: + ```text + listen_addresses = '*' + ``` 2. Create a user to connect from ClickHouse. For demonstration purposes, this example grants full superuser rights. - ```sql - CREATE ROLE clickhouse_user SUPERUSER LOGIN PASSWORD 'ClickHouse_123'; - ``` + ```sql + CREATE ROLE clickhouse_user SUPERUSER LOGIN PASSWORD 'ClickHouse_123'; + ``` 3. Create a new database in PostgreSQL: - ```sql - CREATE DATABASE db_in_psg; - ``` + ```sql + CREATE DATABASE db_in_psg; + ``` 4. Create a new table: - ```sql - CREATE TABLE table1 ( + ```sql + CREATE TABLE table1 ( id integer primary key, column1 varchar(10) - ); - ``` + ); + ``` 5. Let's add a few rows for testing: - ```sql - INSERT INTO table1 + ```sql + INSERT INTO table1 (id, column1) - VALUES + VALUES (1, 'abc'), (2, 'def'); - ``` + ``` 6. To configure PostgreSQL to allow connections to the new database with the new user for replication, add the following entry to the `pg_hba.conf` file. Update the address line with either the subnet or IP address of your PostgreSQL server: - ```text - # TYPE DATABASE USER ADDRESS METHOD - host db_in_psg clickhouse_user 192.168.1.0/24 password - ``` + ```text + # TYPE DATABASE USER ADDRESS METHOD + host db_in_psg clickhouse_user 192.168.1.0/24 password + ``` 7. Reload the `pg_hba.conf` configuration (adjust this command depending on your version): - ```text - /usr/pgsql-12/bin/pg_ctl reload - ``` + ```text + /usr/pgsql-12/bin/pg_ctl reload + ``` 8. Verify the new `clickhouse_user` can login: - ```text - psql -U clickhouse_user -W -d db_in_psg -h - ``` + ```text + psql -U clickhouse_user -W -d db_in_psg -h + ``` -:::note -If you are using this feature in ClickHouse Cloud, you may need the to allow the ClickHouse Cloud IP addresses to access your PostgreSQL instance. -Check the ClickHouse [Cloud Endpoints API](/cloud/get-started/query-endpoints) for egress traffic details. 
-::: + :::note + If you are using this feature in ClickHouse Cloud, you may need the to allow the ClickHouse Cloud IP addresses to access your PostgreSQL instance. + Check the ClickHouse [Cloud Endpoints API](/cloud/get-started/query-endpoints) for egress traffic details. + ::: ### 2. Define a Table in ClickHouse {#2-define-a-table-in-clickhouse} 1. Login to the `clickhouse-client`: - ```bash - clickhouse-client --user default --password ClickHouse123! - ``` + ```bash + clickhouse-client --user default --password ClickHouse123! + ``` 2. Let's create a new database: - ```sql - CREATE DATABASE db_in_ch; - ``` + ```sql + CREATE DATABASE db_in_ch; + ``` 3. Create a table that uses the `PostgreSQL`: - ```sql - CREATE TABLE db_in_ch.table1 - ( + ```sql + CREATE TABLE db_in_ch.table1 + ( id UInt64, column1 String - ) - ENGINE = PostgreSQL('postgres-host.domain.com:5432', 'db_in_psg', 'table1', 'clickhouse_user', 'ClickHouse_123'); - ``` - - The minimum parameters needed are: + ) + ENGINE = PostgreSQL('postgres-host.domain.com:5432', 'db_in_psg', 'table1', 'clickhouse_user', 'ClickHouse_123'); + ``` - |parameter|Description |example | - |---------|----------------------------|---------------------| - |host:port|hostname or IP and port |postgres-host.domain.com:5432| - |database |PostgreSQL database name |db_in_psg | - |user |username to connect to postgres|clickhouse_user | - |password |password to connect to postgres|ClickHouse_123 | + The minimum parameters needed are: - :::note - View the [PostgreSQL table engine](/engines/table-engines/integrations/postgresql) doc page for a complete list of parameters. - ::: + |parameter|Description |example | + |---------|----------------------------|---------------------| + |host:port|hostname or IP and port |postgres-host.domain.com:5432| + |database |PostgreSQL database name |db_in_psg | + |user |username to connect to postgres|clickhouse_user | + |password |password to connect to postgres|ClickHouse_123 | + :::note + View the [PostgreSQL table engine](/engines/table-engines/integrations/postgresql) doc page for a complete list of parameters. + ::: ### 3 Test the Integration {#3-test-the-integration} 1. In ClickHouse, view initial rows: - ```sql - SELECT * FROM db_in_ch.table1 - ``` + ```sql + SELECT * FROM db_in_ch.table1 + ``` - The ClickHouse table should automatically be populated with the two rows that already existed in the table in PostgreSQL: - ```response - Query id: 34193d31-fe21-44ac-a182-36aaefbd78bf + The ClickHouse table should automatically be populated with the two rows that already existed in the table in PostgreSQL: + ```response + Query id: 34193d31-fe21-44ac-a182-36aaefbd78bf - ┌─id─┬─column1─┐ - │ 1 │ abc │ - │ 2 │ def │ - └────┴─────────┘ - ``` + ┌─id─┬─column1─┐ + │ 1 │ abc │ + │ 2 │ def │ + └────┴─────────┘ + ``` 2. Back in PostgreSQL, add a couple of rows to the table: - ```sql - INSERT INTO table1 + ```sql + INSERT INTO table1 (id, column1) - VALUES + VALUES (3, 'ghi'), (4, 'jkl'); - ``` + ``` 4. Those two new rows should appear in your ClickHouse table: - ```sql - SELECT * FROM db_in_ch.table1 - ``` - - The response should be: - ```response - Query id: 86fa2c62-d320-4e47-b564-47ebf3d5d27b - - ┌─id─┬─column1─┐ - │ 1 │ abc │ - │ 2 │ def │ - │ 3 │ ghi │ - │ 4 │ jkl │ - └────┴─────────┘ - ``` + ```sql + SELECT * FROM db_in_ch.table1 + ``` + + The response should be: + ```response + Query id: 86fa2c62-d320-4e47-b564-47ebf3d5d27b + + ┌─id─┬─column1─┐ + │ 1 │ abc │ + │ 2 │ def │ + │ 3 │ ghi │ + │ 4 │ jkl │ + └────┴─────────┘ + ``` 5. 
Let's see what happens when you add rows to the ClickHouse table: - ```sql - INSERT INTO db_in_ch.table1 + ```sql + INSERT INTO db_in_ch.table1 (id, column1) - VALUES + VALUES (5, 'mno'), (6, 'pqr'); - ``` + ``` 6. The rows added in ClickHouse should appear in the table in PostgreSQL: - ```sql - db_in_psg=# SELECT * FROM table1; - id | column1 - ----+--------- + ```sql + db_in_psg=# SELECT * FROM table1; + id | column1 + ----+--------- 1 | abc 2 | def 3 | ghi 4 | jkl 5 | mno 6 | pqr - (6 rows) - ``` + (6 rows) + ``` -This example demonstrated the basic integration between PostgreSQL and ClickHouse using the `PostrgeSQL` table engine. -Check out the [doc page for the PostgreSQL table engine](/engines/table-engines/integrations/postgresql) for more features, such as specifying schemas, returning only a subset of columns, and connecting to multiple replicas. Also check out the [ClickHouse and PostgreSQL - a match made in data heaven - part 1](https://clickhouse.com/blog/migrating-data-between-clickhouse-postgres) blog. + This example demonstrated the basic integration between PostgreSQL and ClickHouse using the `PostrgeSQL` table engine. + Check out the [doc page for the PostgreSQL table engine](/engines/table-engines/integrations/postgresql) for more features, such as specifying schemas, returning only a subset of columns, and connecting to multiple replicas. Also check out the [ClickHouse and PostgreSQL - a match made in data heaven - part 1](https://clickhouse.com/blog/migrating-data-between-clickhouse-postgres) blog. ## Using the MaterializedPostgreSQL database engine {#using-the-materializedpostgresql-database-engine} @@ -192,149 +191,148 @@ This article is to illustrate basic methods of integration using one database, o ***In the following procedures, the PostgreSQL CLI (psql) and the ClickHouse CLI (clickhouse-client) are used. The PostgreSQL server is installed on linux. The following has minimum settings if the postgresql database is new test install*** ### 1. In PostgreSQL {#1-in-postgresql} -1. In `postgresql.conf`, set minimum listen levels, replication wal level and replication slots: +1. In `postgresql.conf`, set minimum listen levels, replication wal level and replication slots: -add the following entries: -```text -listen_addresses = '*' -max_replication_slots = 10 -wal_level = logical -``` -_*ClickHouse needs minimum of `logical` wal level and minimum `2` replication slots_ + add the following entries: + ```text + listen_addresses = '*' + max_replication_slots = 10 + wal_level = logical + ``` + _*ClickHouse needs minimum of `logical` wal level and minimum `2` replication slots_ 2. Using an admin account, create a user to connect from ClickHouse: -```sql -CREATE ROLE clickhouse_user SUPERUSER LOGIN PASSWORD 'ClickHouse_123'; -``` -_*for demonstration purposes, full superuser rights have been granted._ - + ```sql + CREATE ROLE clickhouse_user SUPERUSER LOGIN PASSWORD 'ClickHouse_123'; + ``` + _*for demonstration purposes, full superuser rights have been granted._ 3. create a new database: -```sql -CREATE DATABASE db1; -``` + ```sql + CREATE DATABASE db1; + ``` 4. connect to the new database in `psql`: -```text -\connect db1 -``` + ```text + \connect db1 + ``` 5. create a new table: -```sql -CREATE TABLE table1 ( + ```sql + CREATE TABLE table1 ( id integer primary key, column1 varchar(10) -); -``` + ); + ``` 6. 
add initial rows: -```sql -INSERT INTO table1 -(id, column1) -VALUES -(1, 'abc'), -(2, 'def'); -``` + ```sql + INSERT INTO table1 + (id, column1) + VALUES + (1, 'abc'), + (2, 'def'); + ``` 7. Configure PostgreSQL allow connections to the new database with the new user for replication. Below is the minimum entry to add to the `pg_hba.conf` file: -```text -# TYPE DATABASE USER ADDRESS METHOD -host db1 clickhouse_user 192.168.1.0/24 password -``` -_*for demonstration purposes, this is using clear text password authentication method. update the address line with either the subnet or the address of the server per PostgreSQL documentation_ + ```text + # TYPE DATABASE USER ADDRESS METHOD + host db1 clickhouse_user 192.168.1.0/24 password + ``` + _*for demonstration purposes, this is using clear text password authentication method. update the address line with either the subnet or the address of the server per PostgreSQL documentation_ 8. reload the `pg_hba.conf` configuration with something like this (adjust for your version): -```text -/usr/pgsql-12/bin/pg_ctl reload -``` + ```text + /usr/pgsql-12/bin/pg_ctl reload + ``` 9. Test the login with new `clickhouse_user`: -```text - psql -U clickhouse_user -W -d db1 -h -``` + ```text + psql -U clickhouse_user -W -d db1 -h + ``` ### 2. In ClickHouse {#2-in-clickhouse} 1. log into the ClickHouse CLI -```bash -clickhouse-client --user default --password ClickHouse123! -``` + ```bash + clickhouse-client --user default --password ClickHouse123! + ``` 2. Enable the PostgreSQL experimental feature for the database engine: -```sql -SET allow_experimental_database_materialized_postgresql=1 -``` + ```sql + SET allow_experimental_database_materialized_postgresql=1 + ``` 3. Create the new database to be replicated and define the initial table: -```sql -CREATE DATABASE db1_postgres -ENGINE = MaterializedPostgreSQL('postgres-host.domain.com:5432', 'db1', 'clickhouse_user', 'ClickHouse_123') -SETTINGS materialized_postgresql_tables_list = 'table1'; -``` -minimum options: - -|parameter|Description |example | -|---------|----------------------------|---------------------| -|host:port|hostname or IP and port |postgres-host.domain.com:5432| -|database |PostgreSQL database name |db1 | -|user |username to connect to postgres|clickhouse_user | -|password |password to connect to postgres|ClickHouse_123 | -|settings |additional settings for the engine| materialized_postgresql_tables_list = 'table1'| - -:::info -For complete guide to the PostgreSQL database engine, refer to https://clickhouse.com/docs/engines/database-engines/materialized-postgresql/#settings -::: + ```sql + CREATE DATABASE db1_postgres + ENGINE = MaterializedPostgreSQL('postgres-host.domain.com:5432', 'db1', 'clickhouse_user', 'ClickHouse_123') + SETTINGS materialized_postgresql_tables_list = 'table1'; + ``` + minimum options: + + |parameter|Description |example | + |---------|----------------------------|---------------------| + |host:port|hostname or IP and port |postgres-host.domain.com:5432| + |database |PostgreSQL database name |db1 | + |user |username to connect to postgres|clickhouse_user | + |password |password to connect to postgres|ClickHouse_123 | + |settings |additional settings for the engine| materialized_postgresql_tables_list = 'table1'| + + :::info + For complete guide to the PostgreSQL database engine, refer to https://clickhouse.com/docs/engines/database-engines/materialized-postgresql/#settings + ::: 4. 
Verify the initial table has data: -```sql -ch_env_2 :) select * from db1_postgres.table1; + ```sql + ch_env_2 :) select * from db1_postgres.table1; -SELECT * -FROM db1_postgres.table1 + SELECT * + FROM db1_postgres.table1 -Query id: df2381ac-4e30-4535-b22e-8be3894aaafc + Query id: df2381ac-4e30-4535-b22e-8be3894aaafc -┌─id─┬─column1─┐ -│ 1 │ abc │ -└────┴─────────┘ -┌─id─┬─column1─┐ -│ 2 │ def │ -└────┴─────────┘ -``` + ┌─id─┬─column1─┐ + │ 1 │ abc │ + └────┴─────────┘ + ┌─id─┬─column1─┐ + │ 2 │ def │ + └────┴─────────┘ + ``` ### 3. Test basic replication {#3-test-basic-replication} 1. In PostgreSQL, add new rows: -```sql -INSERT INTO table1 -(id, column1) -VALUES -(3, 'ghi'), -(4, 'jkl'); -``` + ```sql + INSERT INTO table1 + (id, column1) + VALUES + (3, 'ghi'), + (4, 'jkl'); + ``` 2. In ClickHouse, verify the new rows are visible: -```sql -ch_env_2 :) select * from db1_postgres.table1; - -SELECT * -FROM db1_postgres.table1 - -Query id: b0729816-3917-44d3-8d1a-fed912fb59ce - -┌─id─┬─column1─┐ -│ 1 │ abc │ -└────┴─────────┘ -┌─id─┬─column1─┐ -│ 4 │ jkl │ -└────┴─────────┘ -┌─id─┬─column1─┐ -│ 3 │ ghi │ -└────┴─────────┘ -┌─id─┬─column1─┐ -│ 2 │ def │ -└────┴─────────┘ -``` + ```sql + ch_env_2 :) select * from db1_postgres.table1; + + SELECT * + FROM db1_postgres.table1 + + Query id: b0729816-3917-44d3-8d1a-fed912fb59ce + + ┌─id─┬─column1─┐ + │ 1 │ abc │ + └────┴─────────┘ + ┌─id─┬─column1─┐ + │ 4 │ jkl │ + └────┴─────────┘ + ┌─id─┬─column1─┐ + │ 3 │ ghi │ + └────┴─────────┘ + ┌─id─┬─column1─┐ + │ 2 │ def │ + └────┴─────────┘ + ``` ### 4. Summary {#4-summary} This integration guide focused on a simple example on how to replicate a database with a table, however, there exist more advanced options which include replicating the whole database or adding new tables and schemas to the existing replications. Although DDL commands are not supported for this replication, the engine can be set to detect changes and reload the tables when there are structural changes made. diff --git a/docs/integrations/data-ingestion/emqx/index.md b/docs/integrations/data-ingestion/emqx/index.md index 285abbad5ef..81bfd3d7f36 100644 --- a/docs/integrations/data-ingestion/emqx/index.md +++ b/docs/integrations/data-ingestion/emqx/index.md @@ -53,7 +53,6 @@ With the infrastructure provided by cloud providers, EMQX Cloud serves dozens of * You have prepared a Clickhouse Cloud instance to persist device data. * We are using [MQTT X](https://mqttx.app/) as an MQTT client testing tool to connect the deployment of EMQX Cloud to publish MQTT data. Or other methods connecting to the MQTT broker will do the job as well. - ## Get your ClickHouse Cloud service {#get-your-clickhouse-cloudservice} During this setup, we deployed the ClickHouse instance on AWS in N. Virginia (us-east -1), while an EMQX Cloud instance was also deployed in the same region. @@ -141,7 +140,6 @@ Go back to the Overview page and scroll down to the bottom of the page where you Once it has been created, you will find the public IP address in the widget. Please note that if you select "Connect from a specific location" during ClickHouse Cloud setup, you will need to add this IP address to the whitelist. - ## Integration EMQX Cloud with ClickHouse Cloud {#integration-emqx-cloud-with-clickhouse-cloud} The [EMQX Cloud Data Integrations](https://docs.emqx.com/en/cloud/latest/rule_engine/introduction.html#general-flow) is used to configure the rules for handling and responding to EMQX message flows and device events. 
The Data Integrations not only provides a clear and flexible "configurable" architecture solution, but also simplifies the development process, improves user usability, and reduces the coupling degree between the business system and EMQX Cloud. It also provides a superior infrastructure for customization of EMQX Cloud's proprietary capabilities. @@ -164,7 +162,7 @@ Click the ClickHouse card to create a new resource. - User: the username for connecting to your ClickHouse Cloud service. - Key: the password for the connection. - + ### Create a new rule {#create-a-new-rule} @@ -241,25 +239,25 @@ Click "New Connection" on MQTTX and fill the connection form: - Port: MQTT broker connection port. You can get it from the EMQX Cloud overview page. - Username/Password: Use the credential created above, which should be `emqx` and `xxxxxx` in this tutorial. - + -Click the "Connect" button on top right and the connection should be established. + Click the "Connect" button on top right and the connection should be established. -Now you can send messages to the MQTT broker using this tool. -Inputs: + Now you can send messages to the MQTT broker using this tool. + Inputs: 1. Set payload format to "JSON". 2. Set to topic: `temp_hum/emqx` (the topic we just set in the rule) 3. JSON body: -```bash -{"temp": 23.1, "hum": 0.68} -``` + ```bash + {"temp": 23.1, "hum": 0.68} + ``` -Click the send button on the right. You can change the temperature value and send more data to MQTT broker. + Click the send button on the right. You can change the temperature value and send more data to MQTT broker. -The data sent to EMQX Cloud should be processed by the rule engine and inserted into ClickHouse Cloud automatically. + The data sent to EMQX Cloud should be processed by the rule engine and inserted into ClickHouse Cloud automatically. - + ### View rules monitoring {#view-rules-monitoring} diff --git a/docs/integrations/data-ingestion/etl-tools/airbyte-and-clickhouse.md b/docs/integrations/data-ingestion/etl-tools/airbyte-and-clickhouse.md index 29d8a234c7a..9c0cbeb600a 100644 --- a/docs/integrations/data-ingestion/etl-tools/airbyte-and-clickhouse.md +++ b/docs/integrations/data-ingestion/etl-tools/airbyte-and-clickhouse.md @@ -29,10 +29,8 @@ Please note that the Airbyte source and destination for ClickHouse are currently Airbyte is an open-source data integration platform. It allows the creation of ELT data pipelines and is shipped with more than 140 out-of-the-box connectors. This step-by-step tutorial shows how to connect Airbyte to ClickHouse as a destination and load a sample dataset. - ## 1. Download and run Airbyte {#1-download-and-run-airbyte} - 1. Airbyte runs on Docker and uses `docker-compose`. Make sure to download and install the latest versions of Docker. 2. Deploy Airbyte by cloning the official Github repository and running `docker-compose up` in your favorite terminal: @@ -71,22 +69,20 @@ In this section, we will display how to add a ClickHouse instance as a destinati 4. Congratulations! you have now added ClickHouse as a destination in Airbyte. -:::note -In order to use ClickHouse as a destination, the user you'll use need to have the permissions to create databases, tables and insert rows. We recommend creating a dedicated user for Airbyte (eg. `my_airbyte_user`) with the following permissions: + :::note + In order to use ClickHouse as a destination, the user you'll use need to have the permissions to create databases, tables and insert rows. We recommend creating a dedicated user for Airbyte (eg. 
`my_airbyte_user`) with the following permissions: -```sql -CREATE USER 'my_airbyte_user'@'%' IDENTIFIED BY 'your_password_here'; - -GRANT CREATE ON * TO my_airbyte_user; -``` -::: + ```sql + CREATE USER 'my_airbyte_user'@'%' IDENTIFIED BY 'your_password_here'; + GRANT CREATE ON * TO my_airbyte_user; + ``` + ::: ## 3. Add a dataset as a source {#3-add-a-dataset-as-a-source} The example dataset we will use is the New York City Taxi Data (on Github). For this tutorial, we will use a subset of this dataset which corresponds to the month of Jan 2022. - 1. Within Airbyte, select the "Sources" page and add a new source of type file. @@ -101,28 +97,25 @@ The example dataset we will use is the - + 2. Select "Use existing source" and select the New York City Taxi Data, the select "Use existing destination" and select you ClickHouse instance. 3. Fill out the "Set up the connection" form by choosing a Replication Frequency (we will use `manual` for this tutorial) and select `nyc_taxi_2022` as the stream you want to sync. Make sure you pick `Normalized Tabular Data` as a Normalization. - + 4. Now that the connection is created, click on "Sync now" to trigger the data loading (since we picked `Manual` as a Replication Frequency) - + 5. Your data will start loading, you can expand the view to see Airbyte logs and progress. Once the operation finishes, you'll see a `Completed successfully` message in the logs: - + 6. Connect to your ClickHouse instance using your preferred SQL Client and check the resulting table: @@ -164,8 +157,6 @@ The example dataset we will use is the Airbyte official documentation. ```sql @@ -177,5 +168,4 @@ The example dataset we will use is the more details). - 8. Congratulations - you have successfully loaded the NYC taxi data into ClickHouse using Airbyte! diff --git a/docs/integrations/data-ingestion/etl-tools/apache-beam.md b/docs/integrations/data-ingestion/etl-tools/apache-beam.md index bd2ccbd2413..04662198f65 100644 --- a/docs/integrations/data-ingestion/etl-tools/apache-beam.md +++ b/docs/integrations/data-ingestion/etl-tools/apache-beam.md @@ -37,7 +37,6 @@ The `ClickHouseIO` connector is recommended for use starting from Apache Beam ve Earlier versions may not fully support the connector's functionality. ::: - The artifacts could be found in the [official maven repository](https://mvnrepository.com/artifact/org.apache.beam/beam-sdks-java-io-clickhouse). ### Code example {#code-example} @@ -58,10 +57,8 @@ import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; import org.joda.time.DateTime; - public class Main { - public static void main(String[] args) { // Create a Pipeline object. Pipeline p = Pipeline.create(); @@ -73,11 +70,9 @@ public class Main { .addField(Schema.Field.of("insertion_time", Schema.FieldType.DATETIME).withNullable(false)) .build(); - // Apply transforms to the pipeline. PCollection lines = p.apply("ReadLines", TextIO.read().from("src/main/resources/input.csv")); - PCollection rows = lines.apply("ConvertToRow", ParDo.of(new DoFn() { @ProcessElement public void processElement(@Element String line, OutputReceiver out) { @@ -148,7 +143,6 @@ Please consider the following limitations when using the connector: * ClickHouse performs deduplication when inserting into a `ReplicatedMergeTree` or a `Distributed` table built on top of a `ReplicatedMergeTree`. Without replication, inserting into a regular MergeTree can result in duplicates if an insert fails and then successfully retries. 
However, each block is inserted atomically, and the block size can be configured using `ClickHouseIO.Write.withMaxInsertBlockSize(long)`. Deduplication is achieved by using checksums of the inserted blocks. For more information about deduplication, please visit [Deduplication](/guides/developer/deduplication) and [Deduplicate insertion config](/operations/settings/settings#insert_deduplicate). * The connector doesn't perform any DDL statements; therefore, the target table must exist prior insertion. - ## Related content {#related-content} * `ClickHouseIO` class [documentation](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/io/clickhouse/ClickHouseIO.html). * `Github` repository of examples [clickhouse-beam-connector](https://github.com/ClickHouse/clickhouse-beam-connector). diff --git a/docs/integrations/data-ingestion/etl-tools/bladepipe-and-clickhouse.md b/docs/integrations/data-ingestion/etl-tools/bladepipe-and-clickhouse.md index 31a8f328a35..373601944fe 100644 --- a/docs/integrations/data-ingestion/etl-tools/bladepipe-and-clickhouse.md +++ b/docs/integrations/data-ingestion/etl-tools/bladepipe-and-clickhouse.md @@ -23,8 +23,7 @@ import CommunityMaintainedBadge from '@theme/badges/CommunityMaintained'; - -BladePipe is a real-time end-to-end data integration tool with sub-second latency, boosting seamless data flow across platforms. +BladePipe is a real-time end-to-end data integration tool with sub-second latency, boosting seamless data flow across platforms. ClickHouse is one of BladePipe's pre-built connectors, allowing users to integrate data from various sources into ClickHouse automatically. This page will show how to load data into ClickHouse in real time step by step. @@ -39,82 +38,51 @@ Currently BladePipe supports for data integration to ClickHouse from the followi - OceanBase - TiDB -More sources are to be supported. - + More sources are to be supported. ## Download and run BladePipe {#1-run-bladepipe} 1. Log in to BladePipe Cloud. - 2. Follow the instructions in Install Worker (Docker) or Install Worker (Binary) to download and install a BladePipe Worker. - - :::note - Alternatively, you can download and deploy BladePipe Enterprise. - ::: - +:::note +Alternatively, you can download and deploy BladePipe Enterprise. +::: ## Add ClickHouse as a target {#2-add-clickhouse-as-a-target} - - :::note - 1. BladePipe supports ClickHouse version `20.12.3.3` or above. - 2. To use ClickHouse as a target, make sure that the user has SELECT, INSERT and common DDL permissions. - ::: - +:::note +1. BladePipe supports ClickHouse version `20.12.3.3` or above. +2. To use ClickHouse as a target, make sure that the user has SELECT, INSERT and common DDL permissions. +::: 1. In BladePipe, click "DataSource" > "Add DataSource". - 2. Select `ClickHouse`, and fill out the settings by providing your ClickHouse host and port, username and password, and click "Test Connection". - - - + 3. Click "Add DataSource" at the bottom, and a ClickHouse instance is added. - ## Add MySQL as a source {#3-add-mysql-as-a-source} In this tutorial, we use a MySQL instance as the source, and explain the process of loading MySQL data to ClickHouse. - :::note -To use MySQL as a source, make sure that the user has the required permissions. +To use MySQL as a source, make sure that the user has the required permissions. ::: - 1. In BladePipe, click "DataSource" > "Add DataSource". - 2. 
Select `MySQL`, and fill out the settings by providing your MySQL host and port, username and password, and click "Test Connection". - - - + 3. Click "Add DataSource" at the bottom, and a MySQL instance is added. - - ## Create a pipeline {#4-create-a-pipeline} - 1. In BladePipe, click "DataJob" > "Create DataJob". - 2. Select the added MySQL and ClickHouse instances and click "Test Connection" to ensure BladePipe is connected to the instances. Then, select the databases to be moved. - - + 3. Select "Incremental" for DataJob Type, together with the "Full Data" option. - - - + 4. Select the tables to be replicated. - - + 5. Select the columns to be replicated. - - - + 6. Confirm the DataJob creation, and the DataJob runs automatically. - - - + ## Verify the data {#5-verify-the-data} 1. Stop data write in MySQL instance and wait for ClickHouse to merge data. :::note Due to the unpredictable timing of ClickHouse's automatic merging, you can manually trigger a merging by running the `OPTIMIZE TABLE xxx FINAL;` command. Note that there is a chance that this manual merging may not always succeed. - Alternatively, you can run the `CREATE VIEW xxx_v AS SELECT * FROM xxx FINAL;` command to create a view and perform queries on the view to ensure the data is fully merged. ::: - 2. Create a Verification DataJob. Once the Verification DataJob is completed, review the results to confirm that the data in ClickHouse is the same as the data in MySQL. - - + - diff --git a/docs/integrations/data-ingestion/etl-tools/dbt/index.md b/docs/integrations/data-ingestion/etl-tools/dbt/index.md index a206c8f3814..4b3a2f1f084 100644 --- a/docs/integrations/data-ingestion/etl-tools/dbt/index.md +++ b/docs/integrations/data-ingestion/etl-tools/dbt/index.md @@ -40,12 +40,11 @@ dbt provides 4 types of materialization: * **ephemeral**: The model is not directly built in the database but is instead pulled into dependent models as common table expressions. * **incremental**: The model is initially materialized as a table, and in subsequent runs, dbt inserts new rows and updates changed rows in the table. -Additional syntax and clauses define how these models should be updated if their underlying data changes. dbt generally recommends starting with the view materialization until performance becomes a concern. The table materialization provides a query time performance improvement by capturing the results of the model's query as a table at the expense of increased storage. The incremental approach builds on this further to allow subsequent updates to the underlying data to be captured in the target table. + Additional syntax and clauses define how these models should be updated if their underlying data changes. dbt generally recommends starting with the view materialization until performance becomes a concern. The table materialization provides a query time performance improvement by capturing the results of the model's query as a table at the expense of increased storage. The incremental approach builds on this further to allow subsequent updates to the underlying data to be captured in the target table. -The[ current plugin](https://github.com/silentsokolov/dbt-clickhouse) for ClickHouse supports the **view**, **table,**, **ephemeral** and **incremental** materializations. The plugin also supports dbt[ snapshots](https://docs.getdbt.com/docs/building-a-dbt-project/snapshots#check-strategy) and [seeds](https://docs.getdbt.com/docs/building-a-dbt-project/seeds) which we explore in this guide. 
- -For the following guides, we assume you have a ClickHouse instance available. + The[ current plugin](https://github.com/silentsokolov/dbt-clickhouse) for ClickHouse supports the **view**, **table,**, **ephemeral** and **incremental** materializations. The plugin also supports dbt[ snapshots](https://docs.getdbt.com/docs/building-a-dbt-project/snapshots#check-strategy) and [seeds](https://docs.getdbt.com/docs/building-a-dbt-project/seeds) which we explore in this guide. + For the following guides, we assume you have a ClickHouse instance available. ## Setup of dbt and the ClickHouse plugin {#setup-of-dbt-and-the-clickhouse-plugin} @@ -73,14 +72,12 @@ pip install dbt-clickhouse dbt excels when modeling highly relational data. For the purposes of example, we provide a small IMDB dataset with the following relational schema. This dataset originates from the[ relational dataset repository](https://relational.fit.cvut.cz/dataset/IMDb). This is trivial relative to common schemas used with dbt but represents a manageable sample: - We use a subset of these tables as shown. Create the following tables: - ```sql CREATE DATABASE imdb; @@ -212,7 +209,6 @@ The response should look like: In the later guides, we will convert this query into a model - materializing it in ClickHouse as a dbt view and table. - ## Connecting to ClickHouse {#connecting-to-clickhouse} 1. Create a dbt project. In this case we name this after our `imdb` source. When prompted, select `clickhouse` as the database source. @@ -304,7 +300,6 @@ In the later guides, we will convert this query into a model - materializing it Confirm the response includes `Connection test: [OK connection ok]` indicating a successful connection. - ## Creating a simple view materialization {#creating-a-simple-view-materialization} When using the view materialization, a model is rebuilt as a view on each run, via a `CREATE VIEW AS` statement in ClickHouse. This doesn't require any additional storage of data but will be slower to query than table materializations. @@ -510,7 +505,6 @@ In the previous example, our model was materialized as a view. While this might SELECT * FROM imdb_dbt.actor_summary WHERE num_movies > 5 ORDER BY avg_rank DESC LIMIT 10; ``` - ## Creating an Incremental Materialization {#creating-an-incremental-materialization} The previous example created a table to materialize the model. This table will be reconstructed for each dbt execution. This may be infeasible and extremely costly for larger result sets or complex transformations. To address this challenge and reduce the build time, dbt offers Incremental materializations. This allows dbt to insert or update records into a table since the last execution, making it appropriate for event-style data. Under the hood a temporary table is created with all the updated records and then all the untouched records as well as the updated records are inserted into a new target table. This results in similar [limitations](#limitations) for large result sets as for the table model. @@ -698,17 +692,16 @@ AND event_time > subtractMinutes(now(), 15) ORDER BY event_time LIMIT 100; Adjust the above query to the period of execution. We leave result inspection to the user but highlight the general strategy used by the plugin to perform incremental updates: - 1. The plugin creates a temporary table `actor_sumary__dbt_tmp`. Rows that have changed are streamed into this table. 2. A new table, `actor_summary_new,` is created. 
The rows from the old table are, in turn, streamed from the old to new, with a check to make sure row ids do not exist in the temporary table. This effectively handles updates and duplicates. 3. The results from the temporary table are streamed into the new `actor_summary` table: 4. Finally, the new table is exchanged atomically with the old version via an `EXCHANGE TABLES` statement. The old and temporary tables are in turn dropped. -This is visualized below: + This is visualized below: - + -This strategy may encounter challenges on very large models. For further details see [Limitations](#limitations). + This strategy may encounter challenges on very large models. For further details see [Limitations](#limitations). ### Append Strategy (inserts-only mode) {#append-strategy-inserts-only-mode} @@ -719,73 +712,73 @@ To illustrate this mode, we will add another new actor and re-execute dbt run wi 1. Configure append only mode in actor_summary.sql: - ```sql - {{ config(order_by='(updated_at, id, name)', engine='MergeTree()', materialized='incremental', unique_key='id', incremental_strategy='append') }} - ``` + ```sql + {{ config(order_by='(updated_at, id, name)', engine='MergeTree()', materialized='incremental', unique_key='id', incremental_strategy='append') }} + ``` 2. Let's add another famous actor - Danny DeBito - ```sql - INSERT INTO imdb.actors VALUES (845467, 'Danny', 'DeBito', 'M'); - ``` + ```sql + INSERT INTO imdb.actors VALUES (845467, 'Danny', 'DeBito', 'M'); + ``` 3. Let's star Danny in 920 random movies. - ```sql - INSERT INTO imdb.roles - SELECT now() as created_at, 845467 as actor_id, id as movie_id, 'Himself' as role - FROM imdb.movies - LIMIT 920 OFFSET 10000; - ``` + ```sql + INSERT INTO imdb.roles + SELECT now() as created_at, 845467 as actor_id, id as movie_id, 'Himself' as role + FROM imdb.movies + LIMIT 920 OFFSET 10000; + ``` 4. Execute a dbt run and confirm that Danny was added to the actor-summary table - ```response - clickhouse-user@clickhouse:~/imdb$ dbt run - 16:12:16 Running with dbt=1.1.0 - 16:12:16 Found 1 model, 0 tests, 1 snapshot, 0 analyses, 186 macros, 0 operations, 0 seed files, 6 sources, 0 exposures, 0 metrics - 16:12:16 - 16:12:17 Concurrency: 1 threads (target='dev') - 16:12:17 - 16:12:17 1 of 1 START incremental model imdb_dbt.actor_summary........................... [RUN] - 16:12:24 1 of 1 OK created incremental model imdb_dbt.actor_summary...................... [OK in 0.17s] - 16:12:24 - 16:12:24 Finished running 1 incremental model in 0.19s. - 16:12:24 - 16:12:24 Completed successfully - 16:12:24 - 16:12:24 Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1 - ``` + ```response + clickhouse-user@clickhouse:~/imdb$ dbt run + 16:12:16 Running with dbt=1.1.0 + 16:12:16 Found 1 model, 0 tests, 1 snapshot, 0 analyses, 186 macros, 0 operations, 0 seed files, 6 sources, 0 exposures, 0 metrics + 16:12:16 + 16:12:17 Concurrency: 1 threads (target='dev') + 16:12:17 + 16:12:17 1 of 1 START incremental model imdb_dbt.actor_summary........................... [RUN] + 16:12:24 1 of 1 OK created incremental model imdb_dbt.actor_summary...................... [OK in 0.17s] + 16:12:24 + 16:12:24 Finished running 1 incremental model in 0.19s. + 16:12:24 + 16:12:24 Completed successfully + 16:12:24 + 16:12:24 Done. 
PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1 + ``` - ```sql - SELECT * FROM imdb_dbt.actor_summary ORDER BY num_movies DESC LIMIT 3; - ``` + ```sql + SELECT * FROM imdb_dbt.actor_summary ORDER BY num_movies DESC LIMIT 3; + ``` - ```response - +------+-------------------+----------+------------------+------+---------+-------------------+ - |id |name |num_movies|avg_rank |genres|directors|updated_at | - +------+-------------------+----------+------------------+------+---------+-------------------+ - |845467|Danny DeBito |920 |1.4768987303293204|21 |670 |2022-04-26 16:22:06| - |845466|Clicky McClickHouse|910 |1.4687938697032283|21 |662 |2022-04-26 16:20:36| - |45332 |Mel Blanc |909 |5.7884792542982515|19 |148 |2022-04-26 16:17:42| - +------+-------------------+----------+------------------+------+---------+-------------------+ - ``` + ```response + +------+-------------------+----------+------------------+------+---------+-------------------+ + |id |name |num_movies|avg_rank |genres|directors|updated_at | + +------+-------------------+----------+------------------+------+---------+-------------------+ + |845467|Danny DeBito |920 |1.4768987303293204|21 |670 |2022-04-26 16:22:06| + |845466|Clicky McClickHouse|910 |1.4687938697032283|21 |662 |2022-04-26 16:20:36| + |45332 |Mel Blanc |909 |5.7884792542982515|19 |148 |2022-04-26 16:17:42| + +------+-------------------+----------+------------------+------+---------+-------------------+ + ``` -Note how much faster that incremental was compared to the insertion of "Clicky". + Note how much faster that incremental was compared to the insertion of "Clicky". -Checking again the query_log table reveals the differences between the 2 incremental runs: + Checking again the query_log table reveals the differences between the 2 incremental runs: - ```sql -INSERT INTO imdb_dbt.actor_summary ("id", "name", "num_movies", "avg_rank", "genres", "directors", "updated_at") -WITH actor_summary AS ( - SELECT id, + ```sql + INSERT INTO imdb_dbt.actor_summary ("id", "name", "num_movies", "avg_rank", "genres", "directors", "updated_at") + WITH actor_summary AS ( + SELECT id, any(actor_name) AS name, uniqExact(movie_id) AS num_movies, avg(rank) AS avg_rank, uniqExact(genre) AS genres, uniqExact(director_name) AS directors, max(created_at) AS updated_at - FROM ( + FROM ( SELECT imdb.actors.id AS id, concat(imdb.actors.first_name, ' ', imdb.actors.last_name) AS actor_name, imdb.movies.id AS movie_id, @@ -799,17 +792,17 @@ WITH actor_summary AS ( LEFT OUTER JOIN imdb.genres ON imdb.genres.movie_id = imdb.movies.id LEFT OUTER JOIN imdb.movie_directors ON imdb.movie_directors.movie_id = imdb.movies.id LEFT OUTER JOIN imdb.directors ON imdb.directors.id = imdb.movie_directors.director_id - ) - GROUP BY id -) + ) + GROUP BY id + ) -SELECT * -FROM actor_summary --- this filter will only be applied on an incremental run -WHERE id > (SELECT max(id) FROM imdb_dbt.actor_summary) OR updated_at > (SELECT max(updated_at) FROM imdb_dbt.actor_summary) - ``` + SELECT * + FROM actor_summary + -- this filter will only be applied on an incremental run + WHERE id > (SELECT max(id) FROM imdb_dbt.actor_summary) OR updated_at > (SELECT max(updated_at) FROM imdb_dbt.actor_summary) + ``` -In this run, only the new rows are added straight to `imdb_dbt.actor_summary` table and there is no table creation involved. + In this run, only the new rows are added straight to `imdb_dbt.actor_summary` table and there is no table creation involved. 
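    To convince yourself that the append strategy really did leave the existing table in place, you can inspect the parts the run produced. The following is a minimal sketch against the `imdb_dbt.actor_summary` table used throughout this guide; `system.parts` is a standard ClickHouse system table, and the `LIMIT` is only there to keep the output short.

    ```sql
    -- Minimal sketch: list the most recent active data parts of the target table.
    -- An append-only incremental run should simply add a new part (or parts) to
    -- the existing table rather than creating and exchanging a replacement table.
    SELECT
        name,
        rows,
        modification_time
    FROM system.parts
    WHERE database = 'imdb_dbt'
      AND table = 'actor_summary'
      AND active
    ORDER BY modification_time DESC
    LIMIT 5;
    ```

    The newest part should line up with the timestamp of the `dbt run` above, while the parts written by earlier runs remain untouched.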
### Delete and insert mode (experimental) {#deleteinsert-mode-experimental} @@ -831,9 +824,9 @@ In summary, this approach: 2. A `DELETE` is issued against the current `actor_summary` table. Rows are deleted by id from `actor_sumary__dbt_tmp` 3. The rows from `actor_sumary__dbt_tmp` are inserted into `actor_summary` using an `INSERT INTO actor_summary SELECT * FROM actor_sumary__dbt_tmp`. -This process is shown below: + This process is shown below: - + ### insert_overwrite mode (experimental) {#insert_overwrite-mode-experimental} Performs the following steps: @@ -842,15 +835,13 @@ Performs the following steps: 2. Insert only new records (produced by SELECT) into the staging table. 3. Replace only new partitions (present in the staging table) into the target table. -
- -This approach has the following advantages: + This approach has the following advantages: * It is faster than the default strategy because it doesn't copy the entire table. * It is safer than other strategies because it doesn't modify the original table until the INSERT operation completes successfully: in case of intermediate failure, the original table is not modified. * It implements "partitions immutability" data engineering best practice. Which simplifies incremental and parallel data processing, rollbacks, etc. - + ## Creating a snapshot {#creating-a-snapshot} @@ -921,7 +912,7 @@ This example assumes you have completed [Creating an Incremental Table Model](#c {% endsnapshot %} ``` -A few observations regarding this content: + A few observations regarding this content: * The select query defines the results you wish to snapshot over time. The function ref is used to reference our previously created actor_summary model. * We require a timestamp column to indicate record changes. Our updated_at column (see [Creating an Incremental Table Model](#creating-an-incremental-materialization)) can be used here. The parameter strategy indicates our use of a timestamp to denote updates, with the parameter updated_at specifying the column to use. If this is not present in your model you can alternatively use the [check strategy](https://docs.getdbt.com/docs/building-a-dbt-project/snapshots#check-strategy). This is significantly more inefficient and requires the user to specify a list of columns to compare. dbt compares the current and historical values of these columns, recording any changes (or doing nothing if identical). @@ -944,7 +935,7 @@ A few observations regarding this content: 13:26:25 Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1 ``` -Note how a table actor_summary_snapshot has been created in the snapshots db (determined by the target_schema parameter). + Note how a table actor_summary_snapshot has been created in the snapshots db (determined by the target_schema parameter). 4. Sampling this data you will see how dbt has included the columns dbt_valid_from and dbt_valid_to. The latter has values set to null. Subsequent runs will update this. @@ -1005,7 +996,7 @@ Note how a table actor_summary_snapshot has been created in the snapshots db (de 13:46:31 Completed successfully 13:46:31 13:46:31 Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1 - ``` + ``` 7. If we now query our snapshot, notice we have 2 rows for Clicky McClickHouse. Our previous entry now has a dbt_valid_to value. Our new value is recorded with the same value in the dbt_valid_from column, and a dbt_valid_to value of null. If we did have new rows, these would also be appended to the snapshot. @@ -1025,8 +1016,7 @@ Note how a table actor_summary_snapshot has been created in the snapshots db (de +------+----------+------------+----------+-------------------+-------------------+ ``` -For further details on dbt snapshots see [here](https://docs.getdbt.com/docs/building-a-dbt-project/snapshots). - + For further details on dbt snapshots see [here](https://docs.getdbt.com/docs/building-a-dbt-project/snapshots). ## Using seeds {#using-seeds} @@ -1082,7 +1072,6 @@ dbt provides the ability to load data from CSV files. This capability is not sui +-------+----+= ``` - ## Limitations {#limitations} The current ClickHouse plugin for dbt has several limitations users should be aware of: @@ -1091,11 +1080,11 @@ The current ClickHouse plugin for dbt has several limitations users should be aw 2. 
To use Distributed tables to represent a model, users must create the underlying replicated tables on each node manually. The Distributed table can, in turn, be created on top of these. The plugin does not manage cluster creation. 3. When dbt creates a relation (table/view) in a database, it usually creates it as: `{{ database }}.{{ schema }}.{{ table/view id }}`. ClickHouse has no notion of schemas. The plugin therefore uses `{{schema}}.{{ table/view id }}`, where `schema` is the ClickHouse database. -Further Information + Further Information -The previous guides only touch the surface of dbt functionality. Users are recommended to read the excellent [dbt documentation](https://docs.getdbt.com/docs/introduction). + The previous guides only touch the surface of dbt functionality. Users are recommended to read the excellent [dbt documentation](https://docs.getdbt.com/docs/introduction). -Additional configuration for the plugin is described [here](https://github.com/silentsokolov/dbt-clickhouse#model-configuration). + Additional configuration for the plugin is described [here](https://github.com/silentsokolov/dbt-clickhouse#model-configuration). ## Fivetran {#fivetran} diff --git a/docs/integrations/data-ingestion/etl-tools/dlt-and-clickhouse.md b/docs/integrations/data-ingestion/etl-tools/dlt-and-clickhouse.md index 641f33f511c..5863d5edaf0 100644 --- a/docs/integrations/data-ingestion/etl-tools/dlt-and-clickhouse.md +++ b/docs/integrations/data-ingestion/etl-tools/dlt-and-clickhouse.md @@ -30,7 +30,6 @@ Start by initializing a new `dlt` project as follows: dlt init chess clickhouse ``` - :::note This command will initialize your pipeline with chess as the source and ClickHouse as the destination. ::: @@ -52,14 +51,13 @@ To load data into ClickHouse, you need to create a ClickHouse database. Here's a 3. Run the following SQL commands to create a new database, user and grant the necessary permissions: -```bash -CREATE DATABASE IF NOT EXISTS dlt; -CREATE USER dlt IDENTIFIED WITH sha256_password BY 'Dlt*12345789234567'; -GRANT CREATE, ALTER, SELECT, DELETE, DROP, TRUNCATE, OPTIMIZE, SHOW, INSERT, dictGet ON dlt.* TO dlt; -GRANT SELECT ON INFORMATION_SCHEMA.COLUMNS TO dlt; -GRANT CREATE TEMPORARY TABLE, S3 ON *.* TO dlt; -``` - + ```bash + CREATE DATABASE IF NOT EXISTS dlt; + CREATE USER dlt IDENTIFIED WITH sha256_password BY 'Dlt*12345789234567'; + GRANT CREATE, ALTER, SELECT, DELETE, DROP, TRUNCATE, OPTIMIZE, SHOW, INSERT, dictGet ON dlt.* TO dlt; + GRANT SELECT ON INFORMATION_SCHEMA.COLUMNS TO dlt; + GRANT CREATE TEMPORARY TABLE, S3 ON *.* TO dlt; + ``` ### 3. Add credentials {#3-add-credentials} @@ -79,7 +77,6 @@ secure = 1 # Set to 1 if using HTTPS, else 0. dataset_table_separator = "___" # Separator for dataset table names from dataset. ``` - :::note HTTP_PORT The `http_port` parameter specifies the port number to use when connecting to the ClickHouse server's HTTP interface. This is different from default port 9000, which is used for the native TCP protocol. @@ -96,7 +93,6 @@ You can pass a database connection string similar to the one used by the `clickh destination.clickhouse.credentials="clickhouse://dlt:Dlt*12345789234567@localhost:9000/dlt?secure=1" ``` - ## Write disposition {#write-disposition} All [write dispositions](https://dlthub.com/docs/general-usage/incremental-loading#choosing-a-write-disposition) @@ -125,11 +121,11 @@ Data is loaded into ClickHouse using the most efficient method depending on the - jsonl is the preferred format for both direct loading and staging. 
- parquet is supported for both direct loading and staging. -The `clickhouse` destination has a few specific deviations from the default sql destinations: + The `clickhouse` destination has a few specific deviations from the default sql destinations: 1. `Clickhouse` has an experimental `object` datatype, but we have found it to be a bit unpredictable, so the dlt clickhouse destination will load the complex datatype to a text column. If you need this feature, get in touch with our Slack community, and we will consider adding it. 2. `Clickhouse` does not support the `time` datatype. Time will be loaded to a `text` column. -3. `Clickhouse` does not support the `binary` datatype. Instead, binary data will be loaded into a `text` column. When loading from `jsonl`, the binary data will be a base64 string, and when loading from parquet, the `binary` object will be converted to `text`. +3. `Clickhouse` does not support the `binary` datatype. Instead, binary data will be loaded into a `text` column. When loading from `jsonl`, the binary data will be a base64 string, and when loading from parquet, the `binary` object will be converted to `text`. 5. `Clickhouse` accepts adding columns to a populated table that are not null. 6. `Clickhouse` can produce rounding errors under certain conditions when using the float or double datatype. If you cannot afford to have rounding errors, make sure to use the decimal datatype. For example, loading the value 12.7001 into a double column with the loader file format set to `jsonl` will predictably produce a rounding error. @@ -144,12 +140,10 @@ By default, tables are created using the `ReplicatedMergeTree` table engine in C ```bash from dlt.destinations.adapters import clickhouse_adapter - @dlt.resource() def my_resource(): ... - clickhouse_adapter(my_resource, table_engine_type="merge_tree") ``` @@ -170,16 +164,16 @@ Please refer to the filesystem documentation to learn how to configure credentia - Google Cloud Storage - Azure Blob Storage -To run a pipeline with staging enabled: + To run a pipeline with staging enabled: -```bash -pipeline = dlt.pipeline( - pipeline_name='chess_pipeline', - destination='clickhouse', - staging='filesystem', # add this to activate staging - dataset_name='chess_data' -) -``` + ```bash + pipeline = dlt.pipeline( + pipeline_name='chess_pipeline', + destination='clickhouse', + staging='filesystem', # add this to activate staging + dataset_name='chess_data' + ) + ``` ### Using Google Cloud Storage as a staging area {#using-google-cloud-storage-as-a-staging-area} dlt supports using Google Cloud Storage (GCS) as a staging area when loading data into ClickHouse. This is handled automatically by ClickHouse's GCS table function which dlt uses under the hood. @@ -192,31 +186,31 @@ To set up GCS staging with HMAC authentication in dlt: 2. 
Configure the HMAC keys as well as the `client_email`, `project_id` and `private_key` for your service account in your dlt project's ClickHouse destination settings in `config.toml`: -```bash -[destination.filesystem] -bucket_url = "gs://dlt-ci" + ```bash + [destination.filesystem] + bucket_url = "gs://dlt-ci" -[destination.filesystem.credentials] -project_id = "a-cool-project" -client_email = "my-service-account@a-cool-project.iam.gserviceaccount.com" -private_key = "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkaslkdjflasjnkdcopauihj...wEiEx7y+mx\nNffxQBqVVej2n/D93xY99pM=\n-----END PRIVATE KEY-----\n" + [destination.filesystem.credentials] + project_id = "a-cool-project" + client_email = "my-service-account@a-cool-project.iam.gserviceaccount.com" + private_key = "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkaslkdjflasjnkdcopauihj...wEiEx7y+mx\nNffxQBqVVej2n/D93xY99pM=\n-----END PRIVATE KEY-----\n" -[destination.clickhouse.credentials] -database = "dlt" -username = "dlt" -password = "Dlt*12345789234567" -host = "localhost" -port = 9440 -secure = 1 -gcp_access_key_id = "JFJ$$*f2058024835jFffsadf" -gcp_secret_access_key = "DFJdwslf2hf57)%$02jaflsedjfasoi" -``` + [destination.clickhouse.credentials] + database = "dlt" + username = "dlt" + password = "Dlt*12345789234567" + host = "localhost" + port = 9440 + secure = 1 + gcp_access_key_id = "JFJ$$*f2058024835jFffsadf" + gcp_secret_access_key = "DFJdwslf2hf57)%$02jaflsedjfasoi" + ``` -Note: In addition to the HMAC keys `bashgcp_access_key_id` and `gcp_secret_access_key`), you now need to provide the `client_email`, `project_id` and `private_key` for your service account under `[destination.filesystem.credentials]`. This is because the GCS staging support is now implemented as a temporary workaround and is still unoptimized. + Note: In addition to the HMAC keys `bashgcp_access_key_id` and `gcp_secret_access_key`), you now need to provide the `client_email`, `project_id` and `private_key` for your service account under `[destination.filesystem.credentials]`. This is because the GCS staging support is now implemented as a temporary workaround and is still unoptimized. -dlt will pass these credentials to ClickHouse which will handle the authentication and GCS access. + dlt will pass these credentials to ClickHouse which will handle the authentication and GCS access. -There is active work in progress to simplify and improve the GCS staging setup for the ClickHouse dlt destination in the future. Proper GCS staging support is being tracked in these GitHub issues: + There is active work in progress to simplify and improve the GCS staging setup for the ClickHouse dlt destination in the future. Proper GCS staging support is being tracked in these GitHub issues: - Make filesystem destination work with gcs in s3 compatibility mode - Google Cloud Storage staging area support diff --git a/docs/integrations/data-ingestion/etl-tools/nifi-and-clickhouse.md b/docs/integrations/data-ingestion/etl-tools/nifi-and-clickhouse.md index 57e294e9587..78a6a32cfc1 100644 --- a/docs/integrations/data-ingestion/etl-tools/nifi-and-clickhouse.md +++ b/docs/integrations/data-ingestion/etl-tools/nifi-and-clickhouse.md @@ -39,7 +39,6 @@ import CommunityMaintainedBadge from '@theme/badges/CommunityMaintained'; 1. For a new setup, download the binary from https://nifi.apache.org/download.html and start by running `./bin/nifi.sh start` - ## 3. Download the ClickHouse JDBC driver {#3-download-the-clickhouse-jdbc-driver} 1. 
Visit the ClickHouse JDBC driver release page on GitHub and look for the latest JDBC release version @@ -66,13 +65,13 @@ import CommunityMaintainedBadge from '@theme/badges/CommunityMaintained'; 5. Under the "Properties" section, input the following values - | Property | Value | Remark | - | --------------------------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------- | - | Database Connection URL | jdbc:ch:https://HOSTNAME:8443/default?ssl=true | Replace HOSTNAME in the connection URL accordingly | - | Database Driver Class Name | com.clickhouse.jdbc.ClickHouseDriver || - | Database Driver Location(s) | /etc/nifi/nifi-X.XX.X/lib/clickhouse-jdbc-0.X.X-patchXX-shaded.jar | Absolute path to the ClickHouse JDBC driver JAR file | - | Database User | default | ClickHouse username | - | Password | password | ClickHouse password | + | Property | Value | Remark | + | --------------------------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------- | + | Database Connection URL | jdbc:ch:https://HOSTNAME:8443/default?ssl=true | Replace HOSTNAME in the connection URL accordingly | + | Database Driver Class Name | com.clickhouse.jdbc.ClickHouseDriver || + | Database Driver Location(s) | /etc/nifi/nifi-X.XX.X/lib/clickhouse-jdbc-0.X.X-patchXX-shaded.jar | Absolute path to the ClickHouse JDBC driver JAR file | + | Database User | default | ClickHouse username | + | Password | password | ClickHouse password | 6. In the Settings section, change the name of the Controller Service to "ClickHouse JDBC" for easy reference @@ -82,8 +81,6 @@ import CommunityMaintainedBadge from '@theme/badges/CommunityMaintained'; -
- 8. Check the Controller Services tab and ensure that the Controller Service is enabled diff --git a/docs/integrations/data-ingestion/etl-tools/vector-to-clickhouse.md b/docs/integrations/data-ingestion/etl-tools/vector-to-clickhouse.md index b0accc603c8..87a207e96d0 100644 --- a/docs/integrations/data-ingestion/etl-tools/vector-to-clickhouse.md +++ b/docs/integrations/data-ingestion/etl-tools/vector-to-clickhouse.md @@ -41,12 +41,10 @@ Let's define a table to store the log events: There is not really a need for a primary key yet, so that is why **ORDER BY** is set to **tuple()**. ::: - ## 2. Configure Nginx {#2--configure-nginx} We certainly do not want to spend too much time explaining Nginx, but we also do not want to hide all the details, so in this step we will provide you with enough details to get Nginx logging configured. - 1. The following `access_log` property sends logs to `/var/log/nginx/my_access.log` in the **combined** format. This value goes in the `http` section of your `nginx.conf` file: ```bash http { @@ -72,7 +70,6 @@ We certainly do not want to spend too much time explaining Nginx, but we also do Vector collects, transforms and routes logs, metrics, and traces (referred to as **sources**) to lots of different vendors (referred to as **sinks**), including out-of-the-box compatibility with ClickHouse. Sources and sinks are defined in a configuration file named **vector.toml**. - 1. The following **vector.toml** defines a **source** of type **file** that tails the end of **my_access.log**, and it also defines a **sink** as the **access_logs** table defined above: ```bash [sources.nginx_logs] @@ -97,12 +94,10 @@ Vector collects, transforms and routes logs, metrics, and traces (referred to as ``` - ## 4. Parse the Logs {#4-parse-the-logs} Having the logs in ClickHouse is great, but storing each event as a single string does not allow for much data analysis. Let's see how to parse the log events using a materialized view. - 1. A **materialized view** (MV, for short) is a new table based on an existing table, and when inserts are made to the existing table, the new data is also added to the materialized view. Let's see how to define a MV that contains a parsed representation of the log events in **access_logs**, in other words: ```bash 192.168.208.1 - - [12/Oct/2021:15:32:43 +0000] "GET / HTTP/1.1" 304 0 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36" @@ -184,5 +179,4 @@ Having the logs in ClickHouse is great, but storing each event as a single strin The lesson above stored the data in two tables, but you could change the initial `nginxdb.access_logs` table to use the **Null** table engine - the parsed data will still end up in the `nginxdb.access_logs_view` table, but the raw data will not be stored in a table. ::: - -**Summary:** By using Vector, which only required a simple install and quick configuration, we can send logs from an Nginx server to a table in ClickHouse. By using a clever materialized view, we can parse those logs into columns for easier analytics. + **Summary:** By using Vector, which only required a simple install and quick configuration, we can send logs from an Nginx server to a table in ClickHouse. By using a clever materialized view, we can parse those logs into columns for easier analytics. 
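Once the materialized view has parsed the raw log lines into columns, ordinary analytical queries become straightforward. The sketch below is only an illustration: it assumes the parsed view is `nginxdb.access_logs_view` (as in this guide) and that it exposes a response-code column, here called `Status`; substitute whatever column names your materialized view actually defines.

```sql
-- Hedged example: count requests per HTTP status code from the parsed view.
-- `Status` is an assumed column name; replace it with the column your
-- materialized view produces for the response code.
SELECT
    Status,
    count() AS requests
FROM nginxdb.access_logs_view
GROUP BY Status
ORDER BY requests DESC;
```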
diff --git a/docs/integrations/data-ingestion/gcs/index.md b/docs/integrations/data-ingestion/gcs/index.md index 04c5110c387..6979225114b 100644 --- a/docs/integrations/data-ingestion/gcs/index.md +++ b/docs/integrations/data-ingestion/gcs/index.md @@ -34,8 +34,8 @@ This part of the configuration is shown in the highlighted section and specifies - The service account HMAC key and secret - The metadata path on the local disk -```xml - + ```xml + @@ -59,8 +59,8 @@ This part of the configuration is shown in the highlighted section and specifies - -``` + + ``` #### Storage configuration > disks > cache {#storage_configuration--disks--cache} The example configuration highlighted below enables a 10Gi memory cache for the disk `gcs`. @@ -174,14 +174,12 @@ SELECT passenger_count, avg(tip_amount) AS avg_tip, avg(total_amount) AS avg_amo Replication with GCS disks can be accomplished by using the `ReplicatedMergeTree` table engine. See the [replicating a single shard across two GCP regions using GCS](#gcs-multi-region) guide for details. - ### Learn more {#learn-more} The [Cloud Storage XML API](https://cloud.google.com/storage/docs/xml-api/overview) is interoperable with some tools and libraries that work with services such as Amazon Simple Storage Service (Amazon S3). For further information on tuning threads, see [Optimizing for Performance](../s3/index.md#s3-optimizing-performance). - ## Using Google Cloud Storage (GCS) {#gcs-multi-region} :::tip @@ -199,7 +197,7 @@ Sample requirements for high availability: - Two GCS buckets, deployed in the same regions as the two ClickHouse server nodes - Three ClickHouse Keeper nodes, two of them are deployed in the same regions as the ClickHouse server nodes. The third can be in the same region as one of the first two Keeper nodes, but in a different availability zone. -ClickHouse Keeper requires two nodes to function, hence a requirement for three nodes for high availability. + ClickHouse Keeper requires two nodes to function, hence a requirement for three nodes for high availability. ### Prepare virtual machines {#prepare-vms} @@ -249,8 +247,8 @@ All of the ClickHouse Keeper nodes have the same configuration file except for t - Copy the file into place (`/etc/clickhouse-keeper/keeper_config.xml` on each of the Keeper servers - Edit the `server_id` on each machine, based on its entry number in the `raft_configuration` -```xml title=/etc/clickhouse-keeper/keeper_config.xml - + ```xml title=/etc/clickhouse-keeper/keeper_config.xml + trace /var/log/clickhouse-keeper/clickhouse-keeper.log @@ -261,7 +259,7 @@ All of the ClickHouse Keeper nodes have the same configuration file except for t 0.0.0.0 9181 - + 3 /var/lib/clickhouse/coordination/log /var/lib/clickhouse/coordination/snapshots @@ -283,17 +281,17 @@ All of the ClickHouse Keeper nodes have the same configuration file except for t keepernode2.us-east4-c.c.clickhousegcs-374921.internal 9234 - + 3 keepernode3.us-east5-a.c.clickhousegcs-374921.internal 9234 - + - -``` + + ``` ### Configure ClickHouse server {#configure-clickhouse-server} @@ -316,9 +314,8 @@ Replication is coordinated by ClickHouse Keeper. This configuration file identi - Edit the hostnames to match your Keeper hosts - -```xml title=/etc/clickhouse-server/config.d/use-keeper.xml - + ```xml title=/etc/clickhouse-server/config.d/use-keeper.xml + keepernode1.us-east1-b.c.clickhousegcs-374921.internal @@ -333,9 +330,8 @@ Replication is coordinated by ClickHouse Keeper. 
This configuration file identi 9181 - -``` - + + ``` #### Remote ClickHouse servers {#remote-clickhouse-servers} @@ -343,8 +339,8 @@ This file configures the hostname and port of each ClickHouse server in the clus - Edit the file with your hostnames, and make sure that they resolve from the ClickHouse server nodes -```xml title=/etc/clickhouse-server/config.d/remote-servers.xml - + ```xml title=/etc/clickhouse-server/config.d/remote-servers.xml + @@ -359,8 +355,8 @@ This file configures the hostname and port of each ClickHouse server in the clus - -``` + + ``` #### Replica identification {#replica-identification} @@ -390,12 +386,12 @@ These substitutions differ between the two ClickHouse server nodes: - `REPLICA 1 BUCKET` should be set to the name of the bucket in the same region as the server - `REPLICA 1 FOLDER` should be changed to `replica_1` on one of the servers, and `replica_2` on the other -These substitutions are common across the two nodes: + These substitutions are common across the two nodes: - The `access_key_id` should be set to the HMAC Key generated earlier - The `secret_access_key` should be set to HMAC Secret generated earlier -```xml title=/etc/clickhouse-server/config.d/storage.xml - + ```xml title=/etc/clickhouse-server/config.d/storage.xml + @@ -423,8 +419,8 @@ These substitutions are common across the two nodes: - -``` + + ``` ### Start ClickHouse Keeper {#start-clickhouse-keeper} @@ -489,62 +485,62 @@ sudo service clickhouse-server status - gcs - cache -```sql -SELECT * -FROM system.disks -FORMAT Vertical -``` -```response -Row 1: -────── -name: cache -path: /var/lib/clickhouse/disks/gcs/ -free_space: 18446744073709551615 -total_space: 18446744073709551615 -unreserved_space: 18446744073709551615 -keep_free_space: 0 -type: s3 -is_encrypted: 0 -is_read_only: 0 -is_write_once: 0 -is_remote: 1 -is_broken: 0 -cache_path: /var/lib/clickhouse/disks/gcs_cache/ - -Row 2: -────── -name: default -path: /var/lib/clickhouse/ -free_space: 6555529216 -total_space: 10331889664 -unreserved_space: 6555529216 -keep_free_space: 0 -type: local -is_encrypted: 0 -is_read_only: 0 -is_write_once: 0 -is_remote: 0 -is_broken: 0 -cache_path: - -Row 3: -────── -name: gcs -path: /var/lib/clickhouse/disks/gcs/ -free_space: 18446744073709551615 -total_space: 18446744073709551615 -unreserved_space: 18446744073709551615 -keep_free_space: 0 -type: s3 -is_encrypted: 0 -is_read_only: 0 -is_write_once: 0 -is_remote: 1 -is_broken: 0 -cache_path: - -3 rows in set. Elapsed: 0.002 sec. 
-``` + ```sql + SELECT * + FROM system.disks + FORMAT Vertical + ``` + ```response + Row 1: + ────── + name: cache + path: /var/lib/clickhouse/disks/gcs/ + free_space: 18446744073709551615 + total_space: 18446744073709551615 + unreserved_space: 18446744073709551615 + keep_free_space: 0 + type: s3 + is_encrypted: 0 + is_read_only: 0 + is_write_once: 0 + is_remote: 1 + is_broken: 0 + cache_path: /var/lib/clickhouse/disks/gcs_cache/ + + Row 2: + ────── + name: default + path: /var/lib/clickhouse/ + free_space: 6555529216 + total_space: 10331889664 + unreserved_space: 6555529216 + keep_free_space: 0 + type: local + is_encrypted: 0 + is_read_only: 0 + is_write_once: 0 + is_remote: 0 + is_broken: 0 + cache_path: + + Row 3: + ────── + name: gcs + path: /var/lib/clickhouse/disks/gcs/ + free_space: 18446744073709551615 + total_space: 18446744073709551615 + unreserved_space: 18446744073709551615 + keep_free_space: 0 + type: s3 + is_encrypted: 0 + is_read_only: 0 + is_write_once: 0 + is_remote: 1 + is_broken: 0 + cache_path: + + 3 rows in set. Elapsed: 0.002 sec. + ``` #### Verify that tables created on the cluster are created on both nodes {#verify-that-tables-created-on-the-cluster-are-created-on-both-nodes} ```sql -- highlight-next-line diff --git a/docs/integrations/data-ingestion/google-dataflow/dataflow.md b/docs/integrations/data-ingestion/google-dataflow/dataflow.md index f982e6c1618..18a1ce48931 100644 --- a/docs/integrations/data-ingestion/google-dataflow/dataflow.md +++ b/docs/integrations/data-ingestion/google-dataflow/dataflow.md @@ -33,4 +33,4 @@ ClickHouse offers [predefined templates](./templates) designed for specific use - Quick and easy setup for simple use cases. - Suitable also for users with minimal programming expertise. -Both approaches are fully compatible with Google Cloud and the ClickHouse ecosystem, offering flexibility depending on your technical expertise and project requirements. + Both approaches are fully compatible with Google Cloud and the ClickHouse ecosystem, offering flexibility depending on your technical expertise and project requirements. diff --git a/docs/integrations/data-ingestion/google-dataflow/java-runner.md b/docs/integrations/data-ingestion/google-dataflow/java-runner.md index fe6abe1e201..536da0d2b8b 100644 --- a/docs/integrations/data-ingestion/google-dataflow/java-runner.md +++ b/docs/integrations/data-ingestion/google-dataflow/java-runner.md @@ -17,9 +17,9 @@ The Dataflow Java Runner lets you execute custom Apache Beam pipelines on Google ## How it works {#how-it-works} 1. **Pipeline Implementation** - To use the Java Runner, you need to implement your Beam pipeline using the `ClickHouseIO` - our official Apache Beam connector. For code examples and instructions on how to use the `ClickHouseIO`, please visit [ClickHouse Apache Beam](/integrations/apache-beam). + To use the Java Runner, you need to implement your Beam pipeline using the `ClickHouseIO` - our official Apache Beam connector. For code examples and instructions on how to use the `ClickHouseIO`, please visit [ClickHouse Apache Beam](/integrations/apache-beam). 2. **Deployment** - Once your pipeline is implemented and configured, you can deploy it to Dataflow using Google Cloud's deployment tools. Comprehensive deployment instructions are provided in the [Google Cloud Dataflow documentation - Java Pipeline](https://cloud.google.com/dataflow/docs/quickstarts/create-pipeline-java). 
+ Once your pipeline is implemented and configured, you can deploy it to Dataflow using Google Cloud's deployment tools. Comprehensive deployment instructions are provided in the [Google Cloud Dataflow documentation - Java Pipeline](https://cloud.google.com/dataflow/docs/quickstarts/create-pipeline-java). -**Note**: This approach assumes familiarity with the Beam framework and coding expertise. If you prefer a no-code solution, consider using [ClickHouse's predefined templates](./templates). + **Note**: This approach assumes familiarity with the Beam framework and coding expertise. If you prefer a no-code solution, consider using [ClickHouse's predefined templates](./templates). diff --git a/docs/integrations/data-ingestion/google-dataflow/templates.md b/docs/integrations/data-ingestion/google-dataflow/templates.md index f2006282b3c..a7bcfb0b87e 100644 --- a/docs/integrations/data-ingestion/google-dataflow/templates.md +++ b/docs/integrations/data-ingestion/google-dataflow/templates.md @@ -25,7 +25,6 @@ Google Dataflow templates provide a convenient way to execute prebuilt, ready-to As of today, the ClickHouse official template is available via the Google Cloud Console, CLI or Dataflow REST API. For detailed step-by-step instructions, refer to the [Google Dataflow Run Pipeline From a Template Guide](https://cloud.google.com/dataflow/docs/templates/provided-templates). - ## List of ClickHouse Templates {#list-of-clickhouse-templates} * [BigQuery To ClickHouse](./templates/bigquery-to-clickhouse) * [GCS To ClickHouse](https://github.com/ClickHouse/DataflowTemplates/issues/3) (coming soon!) diff --git a/docs/integrations/data-ingestion/google-dataflow/templates/bigquery-to-clickhouse.md b/docs/integrations/data-ingestion/google-dataflow/templates/bigquery-to-clickhouse.md index 7b8040d38da..756e71d722b 100644 --- a/docs/integrations/data-ingestion/google-dataflow/templates/bigquery-to-clickhouse.md +++ b/docs/integrations/data-ingestion/google-dataflow/templates/bigquery-to-clickhouse.md @@ -53,7 +53,6 @@ The template can read the entire table or filter specific records using a provid | `queryTempDataset` | Set an existing dataset to create the temporary table to store the results of the query. For example, `temp_dataset`. | | | | `KMSEncryptionKey` | If reading from BigQuery using the query source, use this Cloud KMS key to encrypt any temporary tables created. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. | | | - :::note Default values for all `ClickHouseIO` parameters can be found in [`ClickHouseIO` Apache Beam Connector](/integrations/apache-beam#clickhouseiowrite-parameters) ::: @@ -65,12 +64,10 @@ To effectively load the BigQuery dataset into ClickHouse, the pipeline performs 1. The templates build a schema object based on the target ClickHouse table. 2. The templates iterate over the BigQuery dataset, and attempts to match columns based on their names. -
- -:::important -Having said that, your BigQuery dataset (either table or query) must have the exact same column names as your ClickHouse -target table. -::: + :::important + Having said that, your BigQuery dataset (either table or query) must have the exact same column names as your ClickHouse + target table. + ::: ## Data type mapping {#data-types-mapping} @@ -98,78 +95,59 @@ requirements and prerequisites. ::: - - Sign in to your Google Cloud Console and search for DataFlow. - + +Sign in to your Google Cloud Console and search for DataFlow. 1. Press the `CREATE JOB FROM TEMPLATE` button - + 2. Once the template form is open, enter a job name and select the desired region. - + 3. In the `DataFlow Template` input, type `ClickHouse` or `BigQuery`, and select the `BigQuery to ClickHouse` template - + 4. Once selected, the form will expand to allow you to provide additional details: - * The ClickHouse server JDBC url, with the following format `jdbc:clickhouse://host:port/schema`. - * The ClickHouse username. - * The ClickHouse target table name. - -
- +* The ClickHouse server JDBC url, with the following format `jdbc:clickhouse://host:port/schema`. +* The ClickHouse username. +* The ClickHouse target table name. :::note The ClickHouse password option is marked as optional, for use cases where there is no password configured. To add it, please scroll down to the `Password for ClickHouse Endpoint` option. ::: - - 5. Customize and add any BigQuery/ClickHouseIO related configurations, as detailed in - the [Template Parameters](#template-parameters) section - -
- - +the [Template Parameters](#template-parameters) section + + ### Install & Configure `gcloud` CLI {#install--configure-gcloud-cli} - - If not already installed, install the [`gcloud` CLI](https://cloud.google.com/sdk/docs/install). - Follow the `Before you begin` section - in [this guide](https://cloud.google.com/dataflow/docs/guides/templates/using-flex-templates#before-you-begin) to set - up the required configurations, settings, and permissions for running the DataFlow template. - +in [this guide](https://cloud.google.com/dataflow/docs/guides/templates/using-flex-templates#before-you-begin) to set +up the required configurations, settings, and permissions for running the DataFlow template. ### Run command {#run-command} - Use the [`gcloud dataflow flex-template run`](https://cloud.google.com/sdk/gcloud/reference/dataflow/flex-template/run) command to run a Dataflow job that uses the Flex Template. - Below is an example of the command: - ```bash gcloud dataflow flex-template run "bigquery-clickhouse-dataflow-$(date +%Y%m%d-%H%M%S)" \ - --template-file-gcs-location "gs://clickhouse-dataflow-templates/bigquery-clickhouse-metadata.json" \ - --parameters inputTableSpec="",jdbcUrl="jdbc:clickhouse://:/?ssl=true&sslmode=NONE",clickHouseUsername="",clickHousePassword="",clickHouseTable="" +--template-file-gcs-location "gs://clickhouse-dataflow-templates/bigquery-clickhouse-metadata.json" \ +--parameters inputTableSpec="",jdbcUrl="jdbc:clickhouse://:/?ssl=true&sslmode=NONE",clickHouseUsername="",clickHousePassword="",clickHouseTable="" ``` - ### Command breakdown {#command-breakdown} - - **Job Name:** The text following the `run` keyword is the unique job name. - **Template File:** The JSON file specified by `--template-file-gcs-location` defines the template structure and - details about the accepted parameters. The mention file path is public and ready to use. +details about the accepted parameters. The mention file path is public and ready to use. - **Parameters:** Parameters are separated by commas. For string-based parameters, enclose the values in double quotes. - ### Expected response {#expected-response} - After running the command, you should see a response similar to the following: - ```bash job: - createTime: '2025-01-26T14:34:04.608442Z' - currentStateTime: '1970-01-01T00:00:00Z' - id: 2025-01-26_06_34_03-13881126003586053150 - location: us-central1 - name: bigquery-clickhouse-dataflow-20250126-153400 - projectId: ch-integrations - startTime: '2025-01-26T14:34:04.608442Z' +createTime: '2025-01-26T14:34:04.608442Z' +currentStateTime: '1970-01-01T00:00:00Z' +id: 2025-01-26_06_34_03-13881126003586053150 +location: us-central1 +name: bigquery-clickhouse-dataflow-20250126-153400 +projectId: ch-integrations +startTime: '2025-01-26T14:34:04.608442Z' ``` - - +
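Before moving on to monitoring, here is a purely illustrative variant of the run command with hypothetical placeholder values filled in; the project, dataset, hostname, and credentials are examples only and must be replaced with your own:

```bash
gcloud dataflow flex-template run "bigquery-clickhouse-dataflow-$(date +%Y%m%d-%H%M%S)" \
  --template-file-gcs-location "gs://clickhouse-dataflow-templates/bigquery-clickhouse-metadata.json" \
  --parameters inputTableSpec="my-project:my_dataset.my_table",jdbcUrl="jdbc:clickhouse://my-instance.clickhouse.cloud:8443/default?ssl=true&sslmode=NONE",clickHouseUsername="default",clickHousePassword="my-password",clickHouseTable="my_table"
```

As noted in the command breakdown, string-based parameter values stay enclosed in double quotes.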
### Monitor the job {#monitor-the-job} diff --git a/docs/integrations/data-ingestion/insert-local-files.md b/docs/integrations/data-ingestion/insert-local-files.md index 778afef7ca9..92fb801c24c 100644 --- a/docs/integrations/data-ingestion/insert-local-files.md +++ b/docs/integrations/data-ingestion/insert-local-files.md @@ -14,21 +14,21 @@ the data using the many powerful and convenient ClickHouse functions. Let's look 1. Suppose we have a TSV file named `comments.tsv` that contains some Hacker News comments, and the header row contains column names. You need to specify an [input format](/interfaces/formats) when you insert the data, which in our case is `TabSeparatedWithNames`: -```text -id type author timestamp comment children -19464423 comment adrianmonk 2019-03-22 16:58:19 "It's an apples and oranges comparison in the first place. There are security expenses related to prison populations. You need staff, facilities, equipment, etc. to manage prisoners behavior (prevent fights, etc.) and keep them from escaping. The two things have a different mission, so of course they're going to have different costs.

It's like saying a refrigerator is more expensive than a microwave. It doesn't mean anything because they do different things." [] -19464461 comment sneakernets 2019-03-22 17:01:10 "Because the science is so solid that it's beating a dead horse at this point.

But with anti-vaxxers, It's like telling someone the red apple you're holding is red, yet they insist that it's green. You can't argue "the merits" with people like this." [19464582] -19465288 comment derefr 2019-03-22 18:15:21 "Because we're talking about the backend-deployment+ops-jargon terms "website" and "webapp", not their general usage. Words can have precise jargon meanings which are different in different disciplines. This is where ops people tend to draw the line: a website is something you can deploy to e.g. an S3 bucket and it'll be fully functional, with no other dependencies that you have to maintain for it. A webapp is something that does have such dependencies that you need to set up and maintain—e.g. a database layer.

But even ignoring that, I also define the terms this way because of the prefix "web." A webapp isn't "an app on the web", but rather "an app powered by the web." An entirely-offline JavaScript SPA that is just served over the web, isn't a web-app. It's just a program that runs in a browser, just like a Flash or ActiveX or Java applet is a program that runs in a browser. (Is a Flash game a "web game"? It's usually considered a browser game, but that's not the same thing.)

We already have a term for the thing that {Flash, ActiveX, Java} applets are: apps. Offline JavaScript SPAs are just apps too. We don't need to add the prefix "web"; it's meaningless here. In any of those cases, if you took the exact same program, and slammed it into an Electron wrapper instead of into a domain-fronted S3 bucket, it would clearly not be a "web app" in any sense. Your SPA would just be "a JavaScript app that uses a browser DOM as its graphics toolkit." Well, that's just as true before you put it in the Electron wrapper.

So "web app", then, has a specific meaning, above and beyond "app." You need something extra. That something extra is a backend, which your browser—driven by the app's logic—interacts with over the web. That's what makes an app "a web app." (This definition intentionally encompasses both server-rendered dynamic HTML, and client-rendered JavaScript SPA apps. You don't need a frontend app; you just need a web backend that something is interacting with. That something can be the browser directly, by clicking links and submitting forms; or it can be a JavaScript frontend, using AJAX.)

A "web site", then, is a "web app" without the "app" part. If it's clear in the above definition what an "app" is, and what a "web app" is, then you can subtract one from the other to derive a definition of a "web not-app." That's a website: something powered by a web backend, which does not do any app things. If we decide that "app things" are basically "storing state", then a "site" is an "app" with no persistent state.

And since the definition of "web" here is about a backend, then the difference between a "web app" and a "web site" (a web not-app) is probably defined by the properties of the backend. So the difference about the ability of the web backend to store state. So a "web site" is a "web app" where the backend does no app things—i.e., stores no state." [] -19465534 comment bduerst 2019-03-22 18:36:40 "Apple included: https://www.theguardian.com/commentisfree/2018/mar/04/apple-..." [] -19466269 comment CalChris 2019-03-22 19:55:13 "> It has the same A12 CPU ... with 3 GB of RAM on the system-on-a-chip

Actually that's package-on-package. The LPDDR4X DRAM is glued (well, reflow soldered) to the back of the A12 Bionic.

https://www.techinsights.com/about-techinsights/overview/blo...

https://en.wikipedia.org/wiki/Package_on_package" [19468341] -19466980 comment onetimemanytime 2019-03-22 21:07:25 ">>The insanity, here, is that you can't take the land the motorhome is on and build a studio on it.

apple and oranges. The permit to built the studio makes that building legit, kinda forever. A motor home, they can chase out with a new law, or just by enforcing existing laws." [] -19467048 comment karambahh 2019-03-22 21:15:41 "I think you're comparing apples to oranges here.

If you reclaim a parking space for another use (such as building accommodation for families or an animal shelter), you're not depriving the car of anything, it's an expensive, large piece of metal and is not sentient.

Next, you'll say that you're depriving car owners from the practicality of parking their vehicles anywhere they like. I'm perfectly fine with depriving car owners from this convenience to allow a human being to have a roof over their head. (speaking from direct experience as I've just minutes ago had to park my car 1km away from home because the city is currently building housing and has restricted parking space nearby)

Then, some might argue that one should be ashamed of helping animals while humans are suffering. That's the exact same train of thought with «we can't allow more migrants in, we have to take care of our "own" homeless people».

This is a false dichotomy. Western societies inequalities are growing larger and larger. Me trying to do my part is insignificant. Me donating to human or animal causes is a small dent into the mountains of inequalities we live on top of. Us collectively, we do make a difference, by donating, voting and generally keeping our eyes open about the world we live in...

Finally, an entirely anecdotal pov: I've witnessed several times extremely poor people going out of their ways to show solidarity to animals or humans. I've also witnessed an awful lot of extremely wealthy individuals complaining about the poor inconveniencing them by just being there, whose wealth was a direct consequences of their ancestors exploiting whose very same poor people." [19467512] -``` + ```text + id type author timestamp comment children + 19464423 comment adrianmonk 2019-03-22 16:58:19 "It's an apples and oranges comparison in the first place. There are security expenses related to prison populations. You need staff, facilities, equipment, etc. to manage prisoners behavior (prevent fights, etc.) and keep them from escaping. The two things have a different mission, so of course they're going to have different costs.

It's like saying a refrigerator is more expensive than a microwave. It doesn't mean anything because they do different things." [] + 19464461 comment sneakernets 2019-03-22 17:01:10 "Because the science is so solid that it's beating a dead horse at this point.

But with anti-vaxxers, It's like telling someone the red apple you're holding is red, yet they insist that it's green. You can't argue "the merits" with people like this." [19464582] + 19465288 comment derefr 2019-03-22 18:15:21 "Because we're talking about the backend-deployment+ops-jargon terms "website" and "webapp", not their general usage. Words can have precise jargon meanings which are different in different disciplines. This is where ops people tend to draw the line: a website is something you can deploy to e.g. an S3 bucket and it'll be fully functional, with no other dependencies that you have to maintain for it. A webapp is something that does have such dependencies that you need to set up and maintain—e.g. a database layer.

But even ignoring that, I also define the terms this way because of the prefix "web." A webapp isn't "an app on the web", but rather "an app powered by the web." An entirely-offline JavaScript SPA that is just served over the web, isn't a web-app. It's just a program that runs in a browser, just like a Flash or ActiveX or Java applet is a program that runs in a browser. (Is a Flash game a "web game"? It's usually considered a browser game, but that's not the same thing.)

We already have a term for the thing that {Flash, ActiveX, Java} applets are: apps. Offline JavaScript SPAs are just apps too. We don't need to add the prefix "web"; it's meaningless here. In any of those cases, if you took the exact same program, and slammed it into an Electron wrapper instead of into a domain-fronted S3 bucket, it would clearly not be a "web app" in any sense. Your SPA would just be "a JavaScript app that uses a browser DOM as its graphics toolkit." Well, that's just as true before you put it in the Electron wrapper.

So "web app", then, has a specific meaning, above and beyond "app." You need something extra. That something extra is a backend, which your browser—driven by the app's logic—interacts with over the web. That's what makes an app "a web app." (This definition intentionally encompasses both server-rendered dynamic HTML, and client-rendered JavaScript SPA apps. You don't need a frontend app; you just need a web backend that something is interacting with. That something can be the browser directly, by clicking links and submitting forms; or it can be a JavaScript frontend, using AJAX.)

A "web site", then, is a "web app" without the "app" part. If it's clear in the above definition what an "app" is, and what a "web app" is, then you can subtract one from the other to derive a definition of a "web not-app." That's a website: something powered by a web backend, which does not do any app things. If we decide that "app things" are basically "storing state", then a "site" is an "app" with no persistent state.

And since the definition of "web" here is about a backend, then the difference between a "web app" and a "web site" (a web not-app) is probably defined by the properties of the backend. So the difference about the ability of the web backend to store state. So a "web site" is a "web app" where the backend does no app things—i.e., stores no state." [] + 19465534 comment bduerst 2019-03-22 18:36:40 "Apple included: https://www.theguardian.com/commentisfree/2018/mar/04/apple-..." [] + 19466269 comment CalChris 2019-03-22 19:55:13 "> It has the same A12 CPU ... with 3 GB of RAM on the system-on-a-chip

Actually that's package-on-package. The LPDDR4X DRAM is glued (well, reflow soldered) to the back of the A12 Bionic.

https://www.techinsights.com/about-techinsights/overview/blo...

https://en.wikipedia.org/wiki/Package_on_package" [19468341] + 19466980 comment onetimemanytime 2019-03-22 21:07:25 ">>The insanity, here, is that you can't take the land the motorhome is on and build a studio on it.

apple and oranges. The permit to built the studio makes that building legit, kinda forever. A motor home, they can chase out with a new law, or just by enforcing existing laws." [] + 19467048 comment karambahh 2019-03-22 21:15:41 "I think you're comparing apples to oranges here.

If you reclaim a parking space for another use (such as building accommodation for families or an animal shelter), you're not depriving the car of anything, it's an expensive, large piece of metal and is not sentient.

Next, you'll say that you're depriving car owners from the practicality of parking their vehicles anywhere they like. I'm perfectly fine with depriving car owners from this convenience to allow a human being to have a roof over their head. (speaking from direct experience as I've just minutes ago had to park my car 1km away from home because the city is currently building housing and has restricted parking space nearby)

Then, some might argue that one should be ashamed of helping animals while humans are suffering. That's the exact same train of thought with «we can't allow more migrants in, we have to take care of our "own" homeless people».

This is a false dichotomy. Western societies inequalities are growing larger and larger. Me trying to do my part is insignificant. Me donating to human or animal causes is a small dent into the mountains of inequalities we live on top of. Us collectively, we do make a difference, by donating, voting and generally keeping our eyes open about the world we live in...

Finally, an entirely anecdotal pov: I've witnessed several times extremely poor people going out of their ways to show solidarity to animals or humans. I've also witnessed an awful lot of extremely wealthy individuals complaining about the poor inconveniencing them by just being there, whose wealth was a direct consequences of their ancestors exploiting whose very same poor people." [19467512] + ``` 2. Let's create the table for our Hacker News data: -```sql -CREATE TABLE hackernews ( + ```sql + CREATE TABLE hackernews ( id UInt32, type String, author String, @@ -36,15 +36,15 @@ CREATE TABLE hackernews ( comment String, children Array(UInt32), tokens Array(String) -) -ENGINE = MergeTree -ORDER BY toYYYYMMDD(timestamp) -``` + ) + ENGINE = MergeTree + ORDER BY toYYYYMMDD(timestamp) + ``` 3. We want to lowercase the `author` column, which is easily done with the [`lower` function](/sql-reference/functions/string-functions#lower). We also want to split the `comment` string into tokens and store the result in the `tokens` column, which can be done using the [`extractAll` function](/sql-reference/functions/string-search-functions#extractall). You do all of this in one `clickhouse-client` command - notice how the `comments.tsv` file is piped into the `clickhouse-client` using the `<` operator: -```bash -clickhouse-client \ + ```bash + clickhouse-client \ --host avw5r4qs3y.us-east-2.aws.clickhouse.cloud \ --secure \ --port 9440 \ @@ -61,40 +61,39 @@ clickhouse-client \ extractAll(comment, '\\w+') as tokens FROM input('id UInt32, type String, author String, timestamp DateTime, comment String, children Array(UInt32)') FORMAT TabSeparatedWithNames -" < comments.tsv -``` + " < comments.tsv + ``` -:::note -The `input` function is useful here as it allows us to convert the data as it's being inserted into the `hackernews` table. The argument to `input` is the format of the incoming raw data, and you will see this in many of the other table functions (where you specify a schema for the incoming data). -::: + :::note + The `input` function is useful here as it allows us to convert the data as it's being inserted into the `hackernews` table. The argument to `input` is the format of the incoming raw data, and you will see this in many of the other table functions (where you specify a schema for the incoming data). + ::: 4. That's it! The data is up in ClickHouse: -```sql -SELECT * -FROM hackernews -LIMIT 7 -``` + ```sql + SELECT * + FROM hackernews + LIMIT 7 + ``` -The result is: + The result is: -```response + ```response -│ 488 │ comment │ mynameishere │ 2007-02-22 14:48:18 │ "It's too bad. Javascript-in-the-browser and Ajax are both nasty hacks that force programmers to do all sorts of shameful things. And the result is--wanky html tricks. Java, for its faults, is fairly clean when run in the applet environment. It has every superiority over JITBAJAX, except for install issues and a chunky load process. Yahoo games seems like just about the only applet success story. Of course, back in the day, non-trivial Applets tended to be too large for the dial-up accounts people had. At least that is changed." 
│ [454927] │ ['It','s','too','bad','Javascript','in','the','browser','and','Ajax','are','both','nasty','hacks','that','force','programmers','to','do','all','sorts','of','shameful','things','And','the','result','is','wanky','html','tricks','Java','for','its','faults','is','fairly','clean','when','run','in','the','applet','environment','It','has','every','superiority','over','JITBAJAX','except','for','install','issues','and','a','chunky','load','process','Yahoo','games','seems','like','just','about','the','only','applet','success','story','Of','course','back','in','the','day','non','trivial','Applets','tended','to','be','too','large','for','the','dial','up','accounts','people','had','At','least','that','is','changed'] │ -│ 575 │ comment │ leoc │ 2007-02-23 00:09:49 │ "I can't find the reference now, but I *think* I've just read something suggesting that the install process for an Apollo applet will involve an "install-this-application?" confirmation dialog followed by a download of 30 seconds or so. If so then Apollo's less promising than I hoped. That kind of install may be low-friction by desktop-app standards but it doesn't compare to the ease of starting a browser-based AJAX or Flash application. (Consider how easy it is to use maps.google.com for the first time.)

Surely it will at least be that Apollo applications will run untrusted by default, and that an already-installed app will start automatically whenever you take your browser to the URL you downloaded it from?" │ [455071] │ ['I','can','t','find','the','reference','now','but','I','think','I','ve','just','read','something','suggesting','that','the','install','process','for','an','Apollo','applet','will','involve','an','34','install','this','application','34','confirmation','dialog','followed','by','a','download','of','30','seconds','or','so','If','so','then','Apollo','s','less','promising','than','I','hoped','That','kind','of','install','may','be','low','friction','by','desktop','app','standards','but','it','doesn','t','compare','to','the','ease','of','starting','a','browser','based','AJAX','or','Flash','application','Consider','how','easy','it','is','to','use','maps','google','com','for','the','first','time','p','Surely','it','will','at','least','be','that','Apollo','applications','will','run','untrusted','by','default','and','that','an','already','installed','app','will','start','automatically','whenever','you','take','your','browser','to','the','URL','you','downloaded','it','from'] │ -│ 3110 │ comment │ davidw │ 2007-03-09 09:19:58 │ "I'm very curious about this tsumobi thing, as it's basically exactly what Hecl is ( http://www.hecl.org ). I'd sort of abbandoned it as an idea for making any money with directly, though, figuring the advantage was just to be able to develop applications a lot faster. I was able to prototype ShopList ( http://shoplist.dedasys.com ) in a few minutes with it, for example.

Edit: BTW, I'd certainly be interested in chatting with the Tsumobi folks. It's a good idea - perhaps there are elements in common that can be reused from/added to Hecl, which is open source under a very liberal license, meaning you can take it and include it even in 'commercial' apps.

I really think that the 'common' bits in a space like that have to be either free or open source (think about browsers, html, JavaScript, java applets, etc...), and that that's not where the money is." │ [3147] │ ['I','m','very','curious','about','this','tsumobi','thing','as','it','s','basically','exactly','what','Hecl','is','http','www','hecl','org','I','d','sort','of','abbandoned','it','as','an','idea','for','making','any','money','with','directly','though','figuring','the','advantage','was','just','to','be','able','to','develop','applications','a','lot','faster','I','was','able','to','prototype','ShopList','http','shoplist','dedasys','com','in','a','few','minutes','with','it','for','example','p','Edit','BTW','I','d','certainly','be','interested','in','chatting','with','the','Tsumobi','folks','It','s','a','good','idea','perhaps','there','are','elements','in','common','that','can','be','reused','from','added','to','Hecl','which','is','open','source','under','a','very','liberal','license','meaning','you','can','take','it','and','include','it','even','in','commercial','apps','p','I','really','think','that','the','common','bits','in','a','space','like','that','have','to','be','either','free','or','open','source','think','about','browsers','html','javascript','java','applets','etc','and','that','that','s','not','where','the','money','is'] │ -│ 4016 │ comment │ mynameishere │ 2007-03-13 22:56:53 │ "http://www.tigerdirect.com/applications/SearchTools/item-details.asp?EdpNo=2853515&CatId=2511

Versus

http://store.apple.com/1-800-MY-APPLE/WebObjects/AppleStore?family=MacBookPro

These are comparable systems, but the Apple has, as I said, roughly an 800 dollar premium. Actually, the cheapest macbook pro costs the same as the high-end Toshiba. If you make good money, it's not a big deal. But when the girl in the coffeehouse asks me what kind of computer she should get to go along with her minimum wage, I'm basically scum to recommend an Apple." │ [] │ ['http','www','tigerdirect','com','applications','SearchTools','item','details','asp','EdpNo','2853515','CatId','2511','p','Versus','p','http','store','apple','com','1','800','MY','APPLE','WebObjects','AppleStore','family','MacBookPro','p','These','are','comparable','systems','but','the','Apple','has','as','I','said','roughly','an','800','dollar','premium','Actually','the','cheapest','macbook','pro','costs','the','same','as','the','high','end','Toshiba','If','you','make','good','money','it','s','not','a','big','deal','But','when','the','girl','in','the','coffeehouse','asks','me','what','kind','of','computer','she','should','get','to','go','along','with','her','minimum','wage','I','m','basically','scum','to','recommend','an','Apple'] │ -│ 4568 │ comment │ jwecker │ 2007-03-16 13:08:04 │ I know the feeling. The same feeling I had back when people were still writing java applets. Maybe a normal user doesn't feel it- maybe it's the programmer in us knowing that there's a big layer running between me and the browser... │ [] │ ['I','know','the','feeling','The','same','feeling','I','had','back','when','people','were','still','writing','java','applets','Maybe','a','normal','user','doesn','t','feel','it','maybe','it','s','the','programmer','in','us','knowing','that','there','s','a','big','layer','running','between','me','and','the','browser'] │ -│ 4900 │ comment │ lupin_sansei │ 2007-03-19 00:26:30 │ "The essence of Ajax is getting Javascript to communicate with the server without reloading the page. Although XmlHttpRequest is most convenient, there were other methods of doing this before XmlHttpRequest such as

- loading a 1 pixel image and sending data in the image's cookie

- loading server data through a tiny frame which contained XML or javascipt data

- Using a java applet to fetch the data on behalf of javascript" │ [] │ ['The','essence','of','Ajax','is','getting','Javascript','to','communicate','with','the','server','without','reloading','the','page','Although','XmlHttpRequest','is','most','convenient','there','were','other','methods','of','doing','this','before','XmlHttpRequest','such','as','p','loading','a','1','pixel','image','and','sending','data','in','the','image','s','cookie','p','loading','server','data','through','a','tiny','frame','which','contained','XML','or','javascipt','data','p','Using','a','java','applet','to','fetch','the','data','on','behalf','of','javascript'] │ -│ 5102 │ comment │ staunch │ 2007-03-20 02:42:47 │ "Well this is exactly the kind of thing that isn't very obvious. It sounds like once you're wealthy there's a new set of rules you have to live by. It's a shame everyone has had to re-learn these things for themselves because a few bad apples can control their jealousy.

Very good to hear it's somewhere in your essay queue though. I'll try not to get rich before you write it, so I have some idea of what to expect :-)" │ [] │ ['Well','this','is','exactly','the','kind','of','thing','that','isn','t','very','obvious','It','sounds','like','once','you','re','wealthy','there','s','a','new','set','of','rules','you','have','to','live','by','It','s','a','shame','everyone','has','had','to','re','learn','these','things','for','themselves','because','a','few','bad','apples','can','control','their','jealousy','p','Very','good','to','hear','it','s','somewhere','in','your','essay','queue','though','I','ll','try','not','to','get','rich','before','you','write','it','so','I','have','some','idea','of','what','to','expect'] │ - -``` + │ 488 │ comment │ mynameishere │ 2007-02-22 14:48:18 │ "It's too bad. Javascript-in-the-browser and Ajax are both nasty hacks that force programmers to do all sorts of shameful things. And the result is--wanky html tricks. Java, for its faults, is fairly clean when run in the applet environment. It has every superiority over JITBAJAX, except for install issues and a chunky load process. Yahoo games seems like just about the only applet success story. Of course, back in the day, non-trivial Applets tended to be too large for the dial-up accounts people had. At least that is changed." │ [454927] │ ['It','s','too','bad','Javascript','in','the','browser','and','Ajax','are','both','nasty','hacks','that','force','programmers','to','do','all','sorts','of','shameful','things','And','the','result','is','wanky','html','tricks','Java','for','its','faults','is','fairly','clean','when','run','in','the','applet','environment','It','has','every','superiority','over','JITBAJAX','except','for','install','issues','and','a','chunky','load','process','Yahoo','games','seems','like','just','about','the','only','applet','success','story','Of','course','back','in','the','day','non','trivial','Applets','tended','to','be','too','large','for','the','dial','up','accounts','people','had','At','least','that','is','changed'] │ + │ 575 │ comment │ leoc │ 2007-02-23 00:09:49 │ "I can't find the reference now, but I *think* I've just read something suggesting that the install process for an Apollo applet will involve an "install-this-application?" confirmation dialog followed by a download of 30 seconds or so. If so then Apollo's less promising than I hoped. That kind of install may be low-friction by desktop-app standards but it doesn't compare to the ease of starting a browser-based AJAX or Flash application. (Consider how easy it is to use maps.google.com for the first time.)

Surely it will at least be that Apollo applications will run untrusted by default, and that an already-installed app will start automatically whenever you take your browser to the URL you downloaded it from?" │ [455071] │ ['I','can','t','find','the','reference','now','but','I','think','I','ve','just','read','something','suggesting','that','the','install','process','for','an','Apollo','applet','will','involve','an','34','install','this','application','34','confirmation','dialog','followed','by','a','download','of','30','seconds','or','so','If','so','then','Apollo','s','less','promising','than','I','hoped','That','kind','of','install','may','be','low','friction','by','desktop','app','standards','but','it','doesn','t','compare','to','the','ease','of','starting','a','browser','based','AJAX','or','Flash','application','Consider','how','easy','it','is','to','use','maps','google','com','for','the','first','time','p','Surely','it','will','at','least','be','that','Apollo','applications','will','run','untrusted','by','default','and','that','an','already','installed','app','will','start','automatically','whenever','you','take','your','browser','to','the','URL','you','downloaded','it','from'] │ + │ 3110 │ comment │ davidw │ 2007-03-09 09:19:58 │ "I'm very curious about this tsumobi thing, as it's basically exactly what Hecl is ( http://www.hecl.org ). I'd sort of abbandoned it as an idea for making any money with directly, though, figuring the advantage was just to be able to develop applications a lot faster. I was able to prototype ShopList ( http://shoplist.dedasys.com ) in a few minutes with it, for example.

Edit: BTW, I'd certainly be interested in chatting with the Tsumobi folks. It's a good idea - perhaps there are elements in common that can be reused from/added to Hecl, which is open source under a very liberal license, meaning you can take it and include it even in 'commercial' apps.

I really think that the 'common' bits in a space like that have to be either free or open source (think about browsers, html, JavaScript, java applets, etc...), and that that's not where the money is." │ [3147] │ ['I','m','very','curious','about','this','tsumobi','thing','as','it','s','basically','exactly','what','Hecl','is','http','www','hecl','org','I','d','sort','of','abbandoned','it','as','an','idea','for','making','any','money','with','directly','though','figuring','the','advantage','was','just','to','be','able','to','develop','applications','a','lot','faster','I','was','able','to','prototype','ShopList','http','shoplist','dedasys','com','in','a','few','minutes','with','it','for','example','p','Edit','BTW','I','d','certainly','be','interested','in','chatting','with','the','Tsumobi','folks','It','s','a','good','idea','perhaps','there','are','elements','in','common','that','can','be','reused','from','added','to','Hecl','which','is','open','source','under','a','very','liberal','license','meaning','you','can','take','it','and','include','it','even','in','commercial','apps','p','I','really','think','that','the','common','bits','in','a','space','like','that','have','to','be','either','free','or','open','source','think','about','browsers','html','javascript','java','applets','etc','and','that','that','s','not','where','the','money','is'] │ + │ 4016 │ comment │ mynameishere │ 2007-03-13 22:56:53 │ "http://www.tigerdirect.com/applications/SearchTools/item-details.asp?EdpNo=2853515&CatId=2511

Versus

http://store.apple.com/1-800-MY-APPLE/WebObjects/AppleStore?family=MacBookPro

These are comparable systems, but the Apple has, as I said, roughly an 800 dollar premium. Actually, the cheapest macbook pro costs the same as the high-end Toshiba. If you make good money, it's not a big deal. But when the girl in the coffeehouse asks me what kind of computer she should get to go along with her minimum wage, I'm basically scum to recommend an Apple." │ [] │ ['http','www','tigerdirect','com','applications','SearchTools','item','details','asp','EdpNo','2853515','CatId','2511','p','Versus','p','http','store','apple','com','1','800','MY','APPLE','WebObjects','AppleStore','family','MacBookPro','p','These','are','comparable','systems','but','the','Apple','has','as','I','said','roughly','an','800','dollar','premium','Actually','the','cheapest','macbook','pro','costs','the','same','as','the','high','end','Toshiba','If','you','make','good','money','it','s','not','a','big','deal','But','when','the','girl','in','the','coffeehouse','asks','me','what','kind','of','computer','she','should','get','to','go','along','with','her','minimum','wage','I','m','basically','scum','to','recommend','an','Apple'] │ + │ 4568 │ comment │ jwecker │ 2007-03-16 13:08:04 │ I know the feeling. The same feeling I had back when people were still writing java applets. Maybe a normal user doesn't feel it- maybe it's the programmer in us knowing that there's a big layer running between me and the browser... │ [] │ ['I','know','the','feeling','The','same','feeling','I','had','back','when','people','were','still','writing','java','applets','Maybe','a','normal','user','doesn','t','feel','it','maybe','it','s','the','programmer','in','us','knowing','that','there','s','a','big','layer','running','between','me','and','the','browser'] │ + │ 4900 │ comment │ lupin_sansei │ 2007-03-19 00:26:30 │ "The essence of Ajax is getting Javascript to communicate with the server without reloading the page. Although XmlHttpRequest is most convenient, there were other methods of doing this before XmlHttpRequest such as

- loading a 1 pixel image and sending data in the image's cookie

- loading server data through a tiny frame which contained XML or javascipt data

- Using a java applet to fetch the data on behalf of javascript" │ [] │ ['The','essence','of','Ajax','is','getting','Javascript','to','communicate','with','the','server','without','reloading','the','page','Although','XmlHttpRequest','is','most','convenient','there','were','other','methods','of','doing','this','before','XmlHttpRequest','such','as','p','loading','a','1','pixel','image','and','sending','data','in','the','image','s','cookie','p','loading','server','data','through','a','tiny','frame','which','contained','XML','or','javascipt','data','p','Using','a','java','applet','to','fetch','the','data','on','behalf','of','javascript'] │ + │ 5102 │ comment │ staunch │ 2007-03-20 02:42:47 │ "Well this is exactly the kind of thing that isn't very obvious. It sounds like once you're wealthy there's a new set of rules you have to live by. It's a shame everyone has had to re-learn these things for themselves because a few bad apples can control their jealousy.

Very good to hear it's somewhere in your essay queue though. I'll try not to get rich before you write it, so I have some idea of what to expect :-)" │ [] │ ['Well','this','is','exactly','the','kind','of','thing','that','isn','t','very','obvious','It','sounds','like','once','you','re','wealthy','there','s','a','new','set','of','rules','you','have','to','live','by','It','s','a','shame','everyone','has','had','to','re','learn','these','things','for','themselves','because','a','few','bad','apples','can','control','their','jealousy','p','Very','good','to','hear','it','s','somewhere','in','your','essay','queue','though','I','ll','try','not','to','get','rich','before','you','write','it','so','I','have','some','idea','of','what','to','expect'] │ + ``` 5. Another option is to use a tool like `cat` to stream the file to `clickhouse-client`. For example, the following command has the same result as using the `<` operator: -```bash -cat comments.tsv | clickhouse-client \ + ```bash + cat comments.tsv | clickhouse-client \ --host avw5r4qs3y.us-east-2.aws.clickhouse.cloud \ --secure \ --port 9440 \ @@ -111,7 +110,7 @@ cat comments.tsv | clickhouse-client \ extractAll(comment, '\\w+') as tokens FROM input('id UInt32, type String, author String, timestamp DateTime, comment String, children Array(UInt32)') FORMAT TabSeparatedWithNames -" -``` + " + ``` -Visit the [docs page on `clickhouse-client`](/interfaces/cli) for details on how to install `clickhouse-client` on your local operating system. + Visit the [docs page on `clickhouse-client`](/interfaces/cli) for details on how to install `clickhouse-client` on your local operating system. diff --git a/docs/integrations/data-ingestion/kafka/confluent/confluent-cloud.md b/docs/integrations/data-ingestion/kafka/confluent/confluent-cloud.md index 9f04a8fc79f..d4f31042f27 100644 --- a/docs/integrations/data-ingestion/kafka/confluent/confluent-cloud.md +++ b/docs/integrations/data-ingestion/kafka/confluent/confluent-cloud.md @@ -46,11 +46,9 @@ Creating a topic on Confluent Cloud is fairly simple, and there are detailed ins #### Gather your connection details {#gather-your-connection-details} - #### Install Connector {#install-connector} Install the fully managed ClickHouse Sink Connector on Confluent Cloud following the [official documentation](https://docs.confluent.io/cloud/current/connectors/cc-clickhouse-sink-connector/cc-clickhouse-sink.html). - #### Configure the Connector {#configure-the-connector} During the configuration of the ClickHouse Sink Connector, you will need to provide the following details: - hostname of your ClickHouse server @@ -59,7 +57,7 @@ During the configuration of the ClickHouse Sink Connector, you will need to prov - database name in ClickHouse where the data will be written - topic name in Kafka that will be used to write data to ClickHouse -The Confluent Cloud UI supports advanced configuration options to adjust poll intervals, batch sizes, and other parameters to optimize performance. + The Confluent Cloud UI supports advanced configuration options to adjust poll intervals, batch sizes, and other parameters to optimize performance. 
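Before saving the connector configuration, it can help to confirm that the ClickHouse HTTPS endpoint is reachable from outside your private network, since the fully managed connector runs in Confluent Cloud. A minimal check, assuming a placeholder hostname and the HTTPS port 8443 referenced above, is to hit the HTTP interface's `/ping` endpoint, which should return `Ok.`:

```bash
# Replace the hostname with your own ClickHouse instance.
# /ping requires no credentials and only confirms basic network reachability.
curl https://your-instance.clickhouse.cloud:8443/ping
```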
#### Known limitations {#known-limitations} * See the list of [Connectors limitations in the official docs](https://docs.confluent.io/cloud/current/connectors/cc-clickhouse-sink-connector/cc-clickhouse-sink.html#limitations) diff --git a/docs/integrations/data-ingestion/kafka/confluent/kafka-connect-http.md b/docs/integrations/data-ingestion/kafka/confluent/kafka-connect-http.md index 21589e54ed8..604b865bc3e 100644 --- a/docs/integrations/data-ingestion/kafka/confluent/kafka-connect-http.md +++ b/docs/integrations/data-ingestion/kafka/confluent/kafka-connect-http.md @@ -13,7 +13,6 @@ import httpAuth from '@site/static/images/integrations/data-ingestion/kafka/conf import httpAdvanced from '@site/static/images/integrations/data-ingestion/kafka/confluent/http_advanced.png'; import createMessageInTopic from '@site/static/images/integrations/data-ingestion/kafka/confluent/create_message_in_topic.png'; - # Confluent HTTP sink connector The HTTP Sink Connector is data type agnostic and thus does not need a Kafka schema as well as supporting ClickHouse specific data types such as Maps and Arrays. This additional flexibility comes at a slight increase in configuration complexity. @@ -28,19 +27,18 @@ Below we describe a simple installation, pulling messages from a single Kafka to #### 1. Gather your connection details {#1-gather-your-connection-details} - #### 2. Run Kafka Connect and the HTTP sink connector {#2-run-kafka-connect-and-the-http-sink-connector} You have two options: * **Self-managed:** Download the Confluent package and install it locally. Follow the installation instructions for installing the connector as documented [here](https://docs.confluent.io/kafka-connect-http/current/overview.html). -If you use the confluent-hub installation method, your local configuration files will be updated. + If you use the confluent-hub installation method, your local configuration files will be updated. * **Confluent Cloud:** A fully managed version of HTTP Sink is available for those using Confluent Cloud for their Kafka hosting. This requires your ClickHouse environment to be accessible from Confluent Cloud. -:::note - The following examples are using Confluent Cloud. -::: + :::note + The following examples are using Confluent Cloud. + ::: #### 3. Create destination table in ClickHouse {#3-create-destination-table-in-clickhouse} @@ -73,12 +71,11 @@ Configure HTTP Sink Connector: * `Auth username` - ClickHouse username * `Auth password` - ClickHouse password -:::note - This HTTP Url is error-prone. Ensure escaping is precise to avoid issues. -::: + :::note + This HTTP Url is error-prone. Ensure escaping is precise to avoid issues. + ::: - -
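One way to avoid the escaping mistakes warned about in the note above is to generate the encoded query portion of the `HTTP Url` rather than typing it by hand. The small sketch below uses the `default.github` table from the example later in this guide; any standard URL-encoding helper works equally well:

```bash
# Prints the URL-encoded INSERT statement to append after "?query=",
# e.g. INSERT%20INTO%20default.github%20FORMAT%20JSONEachRow
python3 -c 'from urllib.parse import quote; print(quote("INSERT INTO default.github FORMAT JSONEachRow"))'
```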
+ * Configuration * `Input Kafka record value format`Depends on your source data but in most cases JSON or Avro. We assume `JSON` in the following settings. @@ -90,7 +87,7 @@ Configure HTTP Sink Connector: * `Retry on HTTP codes` - 400-500 but adapt as required e.g. this may change if you have an HTTP proxy in front of ClickHouse. * `Maximum Reties` - the default (10) is appropriate but feel to adjust for more robust retries. - + #### 5. Testing the connectivity {#5-testing-the-connectivity} Create an message in a topic configured by your HTTP Sink @@ -137,7 +134,6 @@ http://localhost:8123?query=INSERT%20INTO%20default.github%20FORMAT%20JSONEachRo The following additional parameters are relevant to using the HTTP Sink with ClickHouse. A complete parameter list can be found [here](https://docs.confluent.io/kafka-connect-http/current/connector_config.html): - * `request.method` - Set to **POST** * `retry.on.status.codes` - Set to 400-500 to retry on any error codes. Refine based expected errors in data. * `request.body.format` - In most cases this will be JSON. @@ -150,15 +146,14 @@ The following additional parameters are relevant to using the HTTP Sink with Cli * `key.converter` - set according to the types of your keys. * `value.converter` - set based on the type of data on your topic. This data does not need a schema. The format here must be consistent with the FORMAT specified in the parameter `http.api.url`. The simplest here is to use JSON and the org.apache.kafka.connect.json.JsonConverter converter. Treating the value as a string, via the converter org.apache.kafka.connect.storage.StringConverter, is also possible - although this will require the user to extract a value in the insert statement using functions. [Avro format](../../../../interfaces/formats.md#data-format-avro) is also supported in ClickHouse if using the io.confluent.connect.avro.AvroConverter converter. -A full list of settings, including how to configure a proxy, retries, and advanced SSL, can be found [here](https://docs.confluent.io/kafka-connect-http/current/connector_config.html). + A full list of settings, including how to configure a proxy, retries, and advanced SSL, can be found [here](https://docs.confluent.io/kafka-connect-http/current/connector_config.html). -Example configuration files for the Github sample data can be found [here](https://github.com/ClickHouse/clickhouse-docs/tree/main/docs/integrations/data-ingestion/kafka/code/connectors/http_sink), assuming Connect is run in standalone mode and Kafka is hosted in Confluent Cloud. + Example configuration files for the Github sample data can be found [here](https://github.com/ClickHouse/clickhouse-docs/tree/main/docs/integrations/data-ingestion/kafka/code/connectors/http_sink), assuming Connect is run in standalone mode and Kafka is hosted in Confluent Cloud. ##### 2. Create the ClickHouse table {#2-create-the-clickhouse-table} Ensure the table has been created. An example for a minimal github dataset using a standard MergeTree is shown below. - ```sql CREATE TABLE github ( @@ -201,7 +196,6 @@ head -n 10000 github_all_columns.ndjson | kcat -b : -X security.prot A simple read on the target table "Github" should confirm the insertion of data. 
- ```sql SELECT count() FROM default.github; diff --git a/docs/integrations/data-ingestion/kafka/index.md b/docs/integrations/data-ingestion/kafka/index.md index bde0ca73608..51bc7a9408a 100644 --- a/docs/integrations/data-ingestion/kafka/index.md +++ b/docs/integrations/data-ingestion/kafka/index.md @@ -100,6 +100,6 @@ To get started using the Kafka table engine, see the [reference documentation](. * **Custom code** - Custom code using Kafka and ClickHouse [client libraries](../../language-clients/index.md) may be appropriate in cases where custom processing of events is required. -[BYOC]: ../../../cloud/reference/byoc.md -[Cloud]: ../../../cloud-index.md -[Self-hosted]: ../../../intro.md + [BYOC]: ../../../cloud/reference/byoc.md + [Cloud]: ../../../cloud-index.md + [Self-hosted]: ../../../intro.md diff --git a/docs/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md b/docs/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md index d8dcb3f4969..f060bde2eae 100644 --- a/docs/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md +++ b/docs/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md @@ -55,24 +55,24 @@ To install the plugin, follow these steps: - Add a path with the plugin director to [plugin.path](https://kafka.apache.org/documentation/#connectconfigs_plugin.path) configuration in your Connect properties file to allow Confluent Platform to find the plugin. - Provide a topic name, ClickHouse instance hostname, and password in config. -```yml -connector.class=com.clickhouse.kafka.connect.ClickHouseSinkConnector -tasks.max=1 -topics= -ssl=true -jdbcConnectionProperties=?sslmode=STRICT -security.protocol=SSL -hostname= -database= -password= -ssl.truststore.location=/tmp/kafka.client.truststore.jks -port=8443 -value.converter.schemas.enable=false -value.converter=org.apache.kafka.connect.json.JsonConverter -exactlyOnce=true -username=default -schemas.enable=false -``` + ```yml + connector.class=com.clickhouse.kafka.connect.ClickHouseSinkConnector + tasks.max=1 + topics= + ssl=true + jdbcConnectionProperties=?sslmode=STRICT + security.protocol=SSL + hostname= + database= + password= + ssl.truststore.location=/tmp/kafka.client.truststore.jks + port=8443 + value.converter.schemas.enable=false + value.converter=org.apache.kafka.connect.json.JsonConverter + exactlyOnce=true + username=default + schemas.enable=false + ``` - Restart the Confluent Platform. - If you use Confluent Platform, log into Confluent Control Center UI to verify the ClickHouse Sink is available in the list of available connectors. @@ -87,36 +87,36 @@ To connect the ClickHouse Sink to the ClickHouse server, you need to provide: - topics or topics.regex: the Kafka topics to poll - topic names must match table names (**required**) - key and value converters: set based on the type of data on your topic. Required if not already defined in worker config. 
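Because topic names must match table names (unless `topic2TableMap` is used - see the configuration table below), the destination table has to exist before the connector starts. The following is only an illustrative sketch: the topic/table name `events` and its columns are hypothetical placeholders for your own message layout.

```sql
-- Hypothetical destination for a Kafka topic named `events`;
-- the sink resolves the table by matching the topic name.
CREATE TABLE default.events
(
    `timestamp` DateTime,
    `message`   String
)
ENGINE = MergeTree
ORDER BY timestamp;
```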
-The full table of configuration options: - -| Property Name | Description | Default Value | -|-------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------| -| `hostname` (Required) | The hostname or IP address of the server | N/A | -| `port` | The ClickHouse port - default is 8443 (for HTTPS in the cloud), but for HTTP (the default for self-hosted) it should be 8123 | `8443` | -| `ssl` | Enable ssl connection to ClickHouse | `true` | -| `jdbcConnectionProperties` | Connection properties when connecting to Clickhouse. Must start with `?` and joined by `&` between `param=value` | `""` | -| `username` | ClickHouse database username | `default` | -| `password` (Required) | ClickHouse database password | N/A | -| `database` | ClickHouse database name | `default` | -| `connector.class` (Required) | Connector Class(explicit set and keep as the default value) | `"com.clickhouse.kafka.connect.ClickHouseSinkConnector"` | -| `tasks.max` | The number of Connector Tasks | `"1"` | -| `errors.retry.timeout` | ClickHouse JDBC Retry Timeout | `"60"` | -| `exactlyOnce` | Exactly Once Enabled | `"false"` | -| `topics` (Required) | The Kafka topics to poll - topic names must match table names | `""` | -| `key.converter` (Required* - See Description) | Set according to the types of your keys. Required here if you are passing keys (and not defined in worker config). | `"org.apache.kafka.connect.storage.StringConverter"` | -| `value.converter` (Required* - See Description) | Set based on the type of data on your topic. Supported: - JSON, String, Avro or Protobuf formats. Required here if not defined in worker config. | `"org.apache.kafka.connect.json.JsonConverter"` | -| `value.converter.schemas.enable` | Connector Value Converter Schema Support | `"false"` | -| `errors.tolerance` | Connector Error Tolerance. Supported: none, all | `"none"` | -| `errors.deadletterqueue.topic.name` | If set (with errors.tolerance=all), a DLQ will be used for failed batches (see [Troubleshooting](#troubleshooting)) | `""` | -| `errors.deadletterqueue.context.headers.enable` | Adds additional headers for the DLQ | `""` | -| `clickhouseSettings` | Comma-separated list of ClickHouse settings (e.g. "insert_quorum=2, etc...") | `""` | -| `topic2TableMap` | Comma-separated list that maps topic names to table names (e.g. "topic1=table1, topic2=table2, etc...") | `""` | -| `tableRefreshInterval` | Time (in seconds) to refresh the table definition cache | `0` | -| `keeperOnCluster` | Allows configuration of ON CLUSTER parameter for self-hosted instances (e.g. `ON CLUSTER clusterNameInConfigFileDefinition`) for exactly-once connect_state table (see [Distributed DDL Queries](/sql-reference/distributed-ddl) | `""` | -| `bypassRowBinary` | Allows disabling use of RowBinary and RowBinaryWithDefaults for Schema-based data (Avro, Protobuf, etc.) - should only be used when data will have missing columns, and Nullable/Default are unacceptable | `"false"` | -| `dateTimeFormats` | Date time formats for parsing DateTime64 schema fields, separated by `;` (e.g. `someDateField=yyyy-MM-dd HH:mm:ss.SSSSSSSSS;someOtherDateField=yyyy-MM-dd HH:mm:ss`). 
| `""` | -| `tolerateStateMismatch` | Allows the connector to drop records "earlier" than the current offset stored AFTER_PROCESSING (e.g. if offset 5 is sent, and offset 250 was the last recorded offset) | `"false"` | -| `ignorePartitionsWhenBatching` | Will ignore partition when collecting messages for insert (though only if `exactlyOnce` is `false`). Performance Note: The more connector tasks, the fewer kafka partitions assigned per task - this can mean diminishing returns. | `"false"` | + The full table of configuration options: + + | Property Name | Description | Default Value | + |-------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------| + | `hostname` (Required) | The hostname or IP address of the server | N/A | + | `port` | The ClickHouse port - default is 8443 (for HTTPS in the cloud), but for HTTP (the default for self-hosted) it should be 8123 | `8443` | + | `ssl` | Enable ssl connection to ClickHouse | `true` | + | `jdbcConnectionProperties` | Connection properties when connecting to Clickhouse. Must start with `?` and joined by `&` between `param=value` | `""` | + | `username` | ClickHouse database username | `default` | + | `password` (Required) | ClickHouse database password | N/A | + | `database` | ClickHouse database name | `default` | + | `connector.class` (Required) | Connector Class(explicit set and keep as the default value) | `"com.clickhouse.kafka.connect.ClickHouseSinkConnector"` | + | `tasks.max` | The number of Connector Tasks | `"1"` | + | `errors.retry.timeout` | ClickHouse JDBC Retry Timeout | `"60"` | + | `exactlyOnce` | Exactly Once Enabled | `"false"` | + | `topics` (Required) | The Kafka topics to poll - topic names must match table names | `""` | + | `key.converter` (Required* - See Description) | Set according to the types of your keys. Required here if you are passing keys (and not defined in worker config). | `"org.apache.kafka.connect.storage.StringConverter"` | + | `value.converter` (Required* - See Description) | Set based on the type of data on your topic. Supported: - JSON, String, Avro or Protobuf formats. Required here if not defined in worker config. | `"org.apache.kafka.connect.json.JsonConverter"` | + | `value.converter.schemas.enable` | Connector Value Converter Schema Support | `"false"` | + | `errors.tolerance` | Connector Error Tolerance. Supported: none, all | `"none"` | + | `errors.deadletterqueue.topic.name` | If set (with errors.tolerance=all), a DLQ will be used for failed batches (see [Troubleshooting](#troubleshooting)) | `""` | + | `errors.deadletterqueue.context.headers.enable` | Adds additional headers for the DLQ | `""` | + | `clickhouseSettings` | Comma-separated list of ClickHouse settings (e.g. "insert_quorum=2, etc...") | `""` | + | `topic2TableMap` | Comma-separated list that maps topic names to table names (e.g. "topic1=table1, topic2=table2, etc...") | `""` | + | `tableRefreshInterval` | Time (in seconds) to refresh the table definition cache | `0` | + | `keeperOnCluster` | Allows configuration of ON CLUSTER parameter for self-hosted instances (e.g. 
`ON CLUSTER clusterNameInConfigFileDefinition`) for exactly-once connect_state table (see [Distributed DDL Queries](/sql-reference/distributed-ddl) | `""` | + | `bypassRowBinary` | Allows disabling use of RowBinary and RowBinaryWithDefaults for Schema-based data (Avro, Protobuf, etc.) - should only be used when data will have missing columns, and Nullable/Default are unacceptable | `"false"` | + | `dateTimeFormats` | Date time formats for parsing DateTime64 schema fields, separated by `;` (e.g. `someDateField=yyyy-MM-dd HH:mm:ss.SSSSSSSSS;someOtherDateField=yyyy-MM-dd HH:mm:ss`). | `""` | + | `tolerateStateMismatch` | Allows the connector to drop records "earlier" than the current offset stored AFTER_PROCESSING (e.g. if offset 5 is sent, and offset 250 was the last recorded offset) | `"false"` | + | `ignorePartitionsWhenBatching` | Will ignore partition when collecting messages for insert (though only if `exactlyOnce` is `false`). Performance Note: The more connector tasks, the fewer kafka partitions assigned per task - this can mean diminishing returns. | `"false"` | ### Target tables {#target-tables} @@ -155,13 +155,13 @@ Sink, use [Kafka Connect Transformations](https://docs.confluent.io/platform/cur | org.apache.kafka.connect.data.Timestamp | Int32 / Date32 | ✅ | No | | org.apache.kafka.connect.data.Decimal | Decimal | ✅ | No | -- (1) - JSON is supported only when ClickHouse settings has `input_format_binary_read_json_as_string=1`. This works only for RowBinary format family and the setting affects all columns in the insert request so they all should be a string. Connector will convert STRUCT to a JSON string in this case. +- (1) - JSON is supported only when ClickHouse settings has `input_format_binary_read_json_as_string=1`. This works only for RowBinary format family and the setting affects all columns in the insert request so they all should be a string. Connector will convert STRUCT to a JSON string in this case. -- (2) - When struct has unions like `oneof` then converter should be configured to NOT add prefix/suffix to a field names. There is `generate.index.for.unions=false` [setting for `ProtobufConverter`](https://docs.confluent.io/platform/current/schema-registry/connect.html#protobuf). +- (2) - When struct has unions like `oneof` then converter should be configured to NOT add prefix/suffix to a field names. There is `generate.index.for.unions=false` [setting for `ProtobufConverter`](https://docs.confluent.io/platform/current/schema-registry/connect.html#protobuf). -**Without a schema declared:** + **Without a schema declared:** -A record is converted into JSON and sent to ClickHouse as a value in [JSONEachRow](../../../sql-reference/formats.mdx#jsoneachrow) format. + A record is converted into JSON and sent to ClickHouse as a value in [JSONEachRow](../../../sql-reference/formats.mdx#jsoneachrow) format. ### Configuration recipes {#configuration-recipes} @@ -373,21 +373,21 @@ To fix this, you would need to delete the old values stored for that given topic Right now the focus is on identifying errors that are transient and can be retried, including: - `ClickHouseException` - This is a generic exception that can be thrown by ClickHouse. 
- It is usually thrown when the server is overloaded and the following error codes are considered particularly transient: - - 3 - UNEXPECTED_END_OF_FILE - - 159 - TIMEOUT_EXCEEDED - - 164 - READONLY - - 202 - TOO_MANY_SIMULTANEOUS_QUERIES - - 203 - NO_FREE_CONNECTION - - 209 - SOCKET_TIMEOUT - - 210 - NETWORK_ERROR - - 242 - TABLE_IS_READ_ONLY - - 252 - TOO_MANY_PARTS - - 285 - TOO_FEW_LIVE_REPLICAS - - 319 - UNKNOWN_STATUS_OF_INSERT - - 425 - SYSTEM_ERROR - - 999 - KEEPER_EXCEPTION - - 1002 - UNKNOWN_EXCEPTION + It is usually thrown when the server is overloaded and the following error codes are considered particularly transient: + - 3 - UNEXPECTED_END_OF_FILE + - 159 - TIMEOUT_EXCEEDED + - 164 - READONLY + - 202 - TOO_MANY_SIMULTANEOUS_QUERIES + - 203 - NO_FREE_CONNECTION + - 209 - SOCKET_TIMEOUT + - 210 - NETWORK_ERROR + - 242 - TABLE_IS_READ_ONLY + - 252 - TOO_MANY_PARTS + - 285 - TOO_FEW_LIVE_REPLICAS + - 319 - UNKNOWN_STATUS_OF_INSERT + - 425 - SYSTEM_ERROR + - 999 - KEEPER_EXCEPTION + - 1002 - UNKNOWN_EXCEPTION - `SocketTimeoutException` - This is thrown when the socket times out. - `UnknownHostException` - This is thrown when the host cannot be resolved. - `IOException` - This is thrown when there is a problem with the network. diff --git a/docs/integrations/data-ingestion/kafka/kafka-connect-jdbc.md b/docs/integrations/data-ingestion/kafka/kafka-connect-jdbc.md index 501d2abf857..57e46fc4b94 100644 --- a/docs/integrations/data-ingestion/kafka/kafka-connect-jdbc.md +++ b/docs/integrations/data-ingestion/kafka/kafka-connect-jdbc.md @@ -29,14 +29,12 @@ The JDBC Connector is distributed under the [Confluent Community License](https: #### 1. Install Kafka Connect and Connector {#1-install-kafka-connect-and-connector} - We assume you have downloaded the Confluent package and installed it locally. Follow the installation instructions for installing the connector as documented [here](https://docs.confluent.io/kafka-connect-jdbc/current/#install-the-jdbc-connector). If you use the confluent-hub installation method, your local configuration files will be updated. For sending data to ClickHouse from Kafka, we use the Sink component of the connector. - #### 2. Download and install the JDBC Driver {#2-download-and-install-the-jdbc-driver} Download and install the ClickHouse JDBC driver `clickhouse-jdbc--shaded.jar` from [here](https://github.com/ClickHouse/clickhouse-java/releases). Install this into Kafka Connect following the details [here](https://docs.confluent.io/kafka-connect-jdbc/current/#installing-jdbc-drivers). Other drivers may work but have not been tested. @@ -53,7 +51,6 @@ Follow [these instructions](https://docs.confluent.io/cloud/current/cp-component The following parameters are relevant to using the JDBC connector with ClickHouse. A full parameter list can be found [here](https://docs.confluent.io/kafka-connect-jdbc/current/sink-connector/index.html): - * `_connection.url_` - this should take the form of `jdbc:clickhouse://<clickhouse host>:<clickhouse http port>/<target database>` * `connection.user` - a user with write access to the target database * `table.name.format`- ClickHouse table to insert data. This must exist. @@ -69,15 +66,14 @@ The following parameters are relevant to using the JDBC connector with ClickHous * `key.converter` - Set according to the types of your keys. * `value.converter` - Set based on the type of data on your topic. This data must have a supported schema - JSON, Avro or Protobuf formats. 
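Because `table.name.format` must reference an existing table, it is worth confirming the target is present before starting the connector. A minimal check for the `github` table used with this guide's sample dataset could look like the following (an empty result means the table still needs to be created in the step below):

```sql
-- Verify the destination referenced by `table.name.format` exists.
SELECT name, engine
FROM system.tables
WHERE database = 'default' AND name = 'github';
```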
-If using our sample dataset for testing, ensure the following are set: + If using our sample dataset for testing, ensure the following are set: * `value.converter.schemas.enable` - Set to false as we utilize a schema registry. Set to true if you are embedding the schema in each message. * `key.converter` - Set to "org.apache.kafka.connect.storage.StringConverter". We utilise String keys. * `value.converter` - Set "io.confluent.connect.json.JsonSchemaConverter". * `value.converter.schema.registry.url` - Set to the schema server url along with the credentials for the schema server via the parameter `value.converter.schema.registry.basic.auth.user.info`. -Example configuration files for the Github sample data can be found [here](https://github.com/ClickHouse/kafka-samples/tree/main/github_events/jdbc_sink), assuming Connect is run in standalone mode and Kafka is hosted in Confluent Cloud. - + Example configuration files for the Github sample data can be found [here](https://github.com/ClickHouse/kafka-samples/tree/main/github_events/jdbc_sink), assuming Connect is run in standalone mode and Kafka is hosted in Confluent Cloud. #### 4. Create the ClickHouse table {#4-create-the-clickhouse-table} @@ -121,7 +117,6 @@ Start Kafka Connect in either [standalone](https://docs.confluent.io/cloud/curre #### 6. Add data to Kafka {#6-add-data-to-kafka} - Insert messages to Kafka using the [script and config](https://github.com/ClickHouse/kafka-samples/tree/main/producer) provided. You will need to modify github.config to include your Kafka credentials. The script is currently configured for use with Confluent Cloud. ```bash @@ -136,7 +131,6 @@ Kafka Connect should begin consuming messages and inserting rows into ClickHouse A simple read on the target table "Github" should confirm data insertion. - ```sql SELECT count() FROM default.github; ``` diff --git a/docs/integrations/data-ingestion/kafka/kafka-table-engine-named-collections.md b/docs/integrations/data-ingestion/kafka/kafka-table-engine-named-collections.md index 81edf388e47..29ee735790d 100644 --- a/docs/integrations/data-ingestion/kafka/kafka-table-engine-named-collections.md +++ b/docs/integrations/data-ingestion/kafka/kafka-table-engine-named-collections.md @@ -14,7 +14,7 @@ In this guide, we will explore how to connect ClickHouse to Kafka using named co - Changes to settings can be made without altering SQL table definitions. - Easier review and troubleshooting of configurations by inspecting a single configuration file. -This guide has been tested on Apache Kafka 3.4.1 and ClickHouse 24.5.1. + This guide has been tested on Apache Kafka 3.4.1 and ClickHouse 24.5.1. 
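To illustrate the pattern this guide builds toward: with a named collection, the Kafka table DDL no longer needs to spell out brokers, consumer groups, or security settings. The sketch below assumes a named collection called `cluster_1` has already been declared in `config.xml`; the table name `first_kafka_table` is illustrative, while the `kafka_testing` database and the `id`/`first_name`/`last_name` columns match the examples used later in this guide.

```sql
-- Sketch: broker list, topic, consumer group, format and security settings all live
-- in the `cluster_1` named collection in config.xml rather than in the table DDL.
CREATE TABLE kafka_testing.first_kafka_table
(
    `id`         UInt32,
    `first_name` String,
    `last_name`  String
)
ENGINE = Kafka(cluster_1);
```

Changing brokers or credentials then becomes a configuration-file edit rather than a change to the table definition.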
## Assumptions {#assumptions} @@ -164,7 +164,7 @@ Create a materialized view to insert data from the first Kafka table into the fi ```sql CREATE MATERIALIZED VIEW kafka_testing.cluster_1_mv ON CLUSTER STAGE_CLICKHOUSE_CLUSTER TO first_replicated_table AS -SELECT +SELECT id, first_name, last_name @@ -175,7 +175,7 @@ Create a materialized view to insert data from the second Kafka table into the s ```sql CREATE MATERIALIZED VIEW kafka_testing.cluster_2_mv ON CLUSTER STAGE_CLICKHOUSE_CLUSTER TO second_replicated_table AS -SELECT +SELECT id, first_name, last_name @@ -188,15 +188,15 @@ You should now see the relative consumer groups on your Kafka clusters: - `cluster_1_clickhouse_consumer` on `cluster_1` - `cluster_2_clickhouse_consumer` on `cluster_2` -Run the following queries on any of your ClickHouse nodes to see the data in both tables: + Run the following queries on any of your ClickHouse nodes to see the data in both tables: -```sql -SELECT * FROM first_replicated_table LIMIT 10; -``` + ```sql + SELECT * FROM first_replicated_table LIMIT 10; + ``` -```sql -SELECT * FROM second_replicated_table LIMIT 10; -``` + ```sql + SELECT * FROM second_replicated_table LIMIT 10; + ``` ### Note {#note} diff --git a/docs/integrations/data-ingestion/kafka/kafka-table-engine.md b/docs/integrations/data-ingestion/kafka/kafka-table-engine.md index 8d9d6bafa1c..40788b99b62 100644 --- a/docs/integrations/data-ingestion/kafka/kafka-table-engine.md +++ b/docs/integrations/data-ingestion/kafka/kafka-table-engine.md @@ -36,17 +36,14 @@ To persist this data from a read of the table engine, we need a means of capturi #### Steps {#steps} - ##### 1. Prepare {#1-prepare} If you have data populated on a target topic, you can adapt the following for use in your dataset. Alternatively, a sample Github dataset is provided [here](https://datasets-documentation.s3.eu-west-3.amazonaws.com/kafka/github_all_columns.ndjson). This dataset is used in the examples below and uses a reduced schema and subset of the rows (specifically, we limit to Github events concerning the [ClickHouse repository](https://github.com/ClickHouse/ClickHouse)), compared to the full dataset available [here](https://ghe.clickhouse.tech/), for brevity. This is still sufficient for most of the queries [published with the dataset](https://ghe.clickhouse.tech/) to work. - ##### 2. Configure ClickHouse {#2-configure-clickhouse} This step is required if you are connecting to a secure Kafka. These settings cannot be passed through the SQL DDL commands and must be configured in the ClickHouse config.xml. We assume you are connecting to a SASL secured instance. This is the simplest method when interacting with Confluent Cloud. - ```xml @@ -182,7 +179,6 @@ CREATE TABLE github_queue 'JSONEachRow') SETTINGS kafka_thread_per_consumer = 0, kafka_num_consumers = 1; ``` - We discuss engine settings and performance tuning below. At this point, a simple select on the table `github_queue` should read some rows. Note that this will move the consumer offsets forward, preventing these rows from being re-read without a [reset](#common-operations). Note the limit and required parameter `stream_like_engine_allow_direct_select.` ##### 6. 
Create the materialized view {#6-create-the-materialized-view} @@ -289,7 +285,6 @@ The result looks like: | jpn | CommitCommentEvent | 2011-02-12 12:24:31 | github | 0 | | Oxonium | CommitCommentEvent | 2011-02-12 12:31:28 | github | 0 | - ##### Modify Kafka engine settings {#modify-kafka-engine-settings} We recommend dropping the Kafka engine table and recreating it with the new settings. The materialized view does not need to be modified during this process - message consumption will resume once the Kafka engine table is recreated. @@ -340,7 +335,6 @@ Our initial objective is best illustrated: We assume you have the tables and views created under steps for [Kafka to ClickHouse](#kafka-to-clickhouse) and that the topic has been fully consumed. - ##### 1. Inserting rows directly {#1-inserting-rows-directly} First, confirm the count of the target table. @@ -473,7 +467,6 @@ Multiple ClickHouse instances can all be configured to read from a topic using t Consider the following when looking to increase Kafka Engine table throughput performance: - * The performance will vary depending on the message size, format, and target table types. 100k rows/sec on a single table engine should be considered obtainable. By default, messages are read in blocks, controlled by the parameter kafka_max_block_size. By default, this is set to the [max_insert_block_size](/operations/settings/settings#max_insert_block_size), defaulting to 1,048,576. Unless messages are extremely large, this should nearly always be increased. Values between 500k to 1M are not uncommon. Test and evaluate the effect on throughput performance. * The number of consumers for a table engine can be increased using kafka_num_consumers. However, by default, inserts will be linearized in a single thread unless kafka_thread_per_consumer is changed from the default value of 1. Set this to 1 to ensure flushes are performed in parallel. Note that creating a Kafka engine table with N consumers (and kafka_thread_per_consumer=1) is logically equivalent to creating N Kafka engines, each with a materialized view and kafka_thread_per_consumer=0. * Increasing consumers is not a free operation. Each consumer maintains its own buffers and threads, increasing the overhead on the server. Be conscious of the overhead of consumers and scale linearly across your cluster first and if possible. @@ -481,7 +474,7 @@ Consider the following when looking to increase Kafka Engine table throughput pe * [background_message_broker_schedule_pool_size](/operations/server-configuration-parameters/settings#background_message_broker_schedule_pool_size) sets the number of threads performing background tasks. These threads are used for Kafka streaming. This setting is applied at the ClickHouse server start and can't be changed in a user session, defaulting to 16. If you see timeouts in the logs, it may be appropriate to increase this. * For communication with Kafka, the librdkafka library is used, which itself creates threads. Large numbers of Kafka tables, or consumers, can thus result in large numbers of context switches. Either distribute this load across the cluster, only replicating the target tables if possible, or consider using a table engine to read from multiple topics - a list of values is supported. Multiple materialized views can be read from a single table, each filtering to the data from a specific topic. -Any settings changes should be tested. We recommend monitoring Kafka consumer lags to ensure you are properly scaled. + Any settings changes should be tested. 
We recommend monitoring Kafka consumer lags to ensure you are properly scaled. #### Additional settings {#additional-settings} @@ -489,14 +482,14 @@ Aside from the settings discussed above, the following may be of interest: * [Kafka_max_wait_ms](/operations/settings/settings#kafka_max_wait_ms) - The wait time in milliseconds for reading messages from Kafka before retry. Set at a user profile level and defaults to 5000. -[All settings ](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md)from the underlying librdkafka can also be placed in the ClickHouse configuration files inside a _kafka_ element - setting names should be XML elements with periods replaced with underscores e.g. + [All settings ](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md)from the underlying librdkafka can also be placed in the ClickHouse configuration files inside a _kafka_ element - setting names should be XML elements with periods replaced with underscores e.g. -```xml - - + ```xml + + false - - -``` + + + ``` -These are expert settings and we'd suggest you refer to the Kafka documentation for an in-depth explanation. + These are expert settings and we'd suggest you refer to the Kafka documentation for an in-depth explanation. diff --git a/docs/integrations/data-ingestion/kafka/kafka-vector.md b/docs/integrations/data-ingestion/kafka/kafka-vector.md index 4c09ff96be2..a28ce8fb271 100644 --- a/docs/integrations/data-ingestion/kafka/kafka-vector.md +++ b/docs/integrations/data-ingestion/kafka/kafka-vector.md @@ -23,7 +23,6 @@ Note that the current implementation of the ClickHouse sink utilizes the HTTP in ### License {#license} Vector is distributed under the [MPL-2.0 License](https://github.com/vectordotdev/vector/blob/master/LICENSE) - ### Gather your connection details {#gather-your-connection-details} @@ -31,19 +30,18 @@ Vector is distributed under the [MPL-2.0 License](https://github.com/vectordotde 1. Create the Kafka `github` topic and insert the [Github dataset](https://datasets-documentation.s3.eu-west-3.amazonaws.com/kafka/github_all_columns.ndjson). + ```bash + cat /opt/data/github/github_all_columns.ndjson | kcat -b : -X security.protocol=sasl_ssl -X sasl.mechanisms=PLAIN -X sasl.username= -X sasl.password= -t github + ``` -```bash -cat /opt/data/github/github_all_columns.ndjson | kcat -b : -X security.protocol=sasl_ssl -X sasl.mechanisms=PLAIN -X sasl.username= -X sasl.password= -t github -``` - -This dataset consists of 200,000 rows focused on the `ClickHouse/ClickHouse` repository. + This dataset consists of 200,000 rows focused on the `ClickHouse/ClickHouse` repository. 2. Ensure the target table is created. Below we use the default database. 
-```sql + ```sql -CREATE TABLE github -( + CREATE TABLE github + ( file_time DateTime, event_type Enum('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4, 'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8, 'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11, 'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15, 'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19, 'ForkApplyEvent' = 20, 'Event' = 21, 'TeamAddEvent' = 22), @@ -70,41 +68,41 @@ CREATE TABLE github merged_by LowCardinality(String), review_comments UInt32, member_login LowCardinality(String) -) ENGINE = MergeTree ORDER BY (event_type, repo_name, created_at); + ) ENGINE = MergeTree ORDER BY (event_type, repo_name, created_at); -``` + ``` 3. [Download and install Vector](https://vector.dev/docs/setup/quickstart/). Create a `kafka.toml` configuration file and modify the values for your Kafka and ClickHouse instances. -```toml -[sources.github] -type = "kafka" -auto_offset_reset = "smallest" -bootstrap_servers = ":" -group_id = "vector" -topics = [ "github" ] -tls.enabled = true -sasl.enabled = true -sasl.mechanism = "PLAIN" -sasl.username = "" -sasl.password = "" -decoding.codec = "json" - -[sinks.clickhouse] -type = "clickhouse" -inputs = ["github"] -endpoint = "http://localhost:8123" -database = "default" -table = "github" -skip_unknown_fields = true -auth.strategy = "basic" -auth.user = "username" -auth.password = "password" -buffer.max_events = 10000 -batch.timeout_secs = 1 -``` - -A few important notes on this configuration and behavior of Vector: + ```toml + [sources.github] + type = "kafka" + auto_offset_reset = "smallest" + bootstrap_servers = ":" + group_id = "vector" + topics = [ "github" ] + tls.enabled = true + sasl.enabled = true + sasl.mechanism = "PLAIN" + sasl.username = "" + sasl.password = "" + decoding.codec = "json" + + [sinks.clickhouse] + type = "clickhouse" + inputs = ["github"] + endpoint = "http://localhost:8123" + database = "default" + table = "github" + skip_unknown_fields = true + auth.strategy = "basic" + auth.user = "username" + auth.password = "password" + buffer.max_events = 10000 + batch.timeout_secs = 1 + ``` + + A few important notes on this configuration and behavior of Vector: - This example has been tested against Confluent Cloud. Therefore, the `sasl.*` and `ssl.enabled` security options may not be appropriate in self-managed cases. - A protocol prefix is not required for the configuration parameter `bootstrap_servers` e.g. `pkc-2396y.us-east-1.aws.confluent.cloud:9092` @@ -116,18 +114,18 @@ A few important notes on this configuration and behavior of Vector: 4. Start Vector -```bash -vector --config ./kafka.toml -``` + ```bash + vector --config ./kafka.toml + ``` -By default, a [health check](https://vector.dev/docs/reference/configuration/sinks/clickhouse/#healthcheck) is required before insertions begin to ClickHouse. This ensures connectivity can be established and the schema read. Prepend `VECTOR_LOG=debug`to obtain further logging which can be helpful should you encounter issues. + By default, a [health check](https://vector.dev/docs/reference/configuration/sinks/clickhouse/#healthcheck) is required before insertions begin to ClickHouse. This ensures connectivity can be established and the schema read. Prepend `VECTOR_LOG=debug`to obtain further logging which can be helpful should you encounter issues. 5. 
Confirm the insertion of the data. -```sql -SELECT count() AS count FROM github; -``` + ```sql + SELECT count() AS count FROM github; + ``` -| count | -| :--- | -| 200000 | + | count | + | :--- | + | 200000 | diff --git a/docs/integrations/data-ingestion/kafka/msk/index.md b/docs/integrations/data-ingestion/kafka/msk/index.md index ca17e60b88b..6706828b5a8 100644 --- a/docs/integrations/data-ingestion/kafka/msk/index.md +++ b/docs/integrations/data-ingestion/kafka/msk/index.md @@ -30,7 +30,6 @@ We assume: ## The official Kafka connector from ClickHouse with Amazon MSK {#the-official-kafka-connector-from-clickhouse-with-amazon-msk} - ### Gather your connection details {#gather-your-connection-details} @@ -43,23 +42,23 @@ We assume: 1. Install the downloaded `jar` file on [Custom plugin page](https://docs.aws.amazon.com/msk/latest/developerguide/msk-connect-plugins.html) of Amazon MSK console. 1. If Connector communicates with a public ClickHouse instance, [enable internet access](https://docs.aws.amazon.com/msk/latest/developerguide/msk-connect-internet-access.html). 1. Provide a topic name, ClickHouse instance hostname, and password in config. -```yml -connector.class=com.clickhouse.kafka.connect.ClickHouseSinkConnector -tasks.max=1 -topics= -ssl=true -security.protocol=SSL -hostname= -database= -password= -ssl.truststore.location=/tmp/kafka.client.truststore.jks -port=8443 -value.converter.schemas.enable=false -value.converter=org.apache.kafka.connect.json.JsonConverter -exactlyOnce=true -username=default -schemas.enable=false -``` + ```yml + connector.class=com.clickhouse.kafka.connect.ClickHouseSinkConnector + tasks.max=1 + topics= + ssl=true + security.protocol=SSL + hostname= + database= + password= + ssl.truststore.location=/tmp/kafka.client.truststore.jks + port=8443 + value.converter.schemas.enable=false + value.converter=org.apache.kafka.connect.json.JsonConverter + exactlyOnce=true + username=default + schemas.enable=false + ``` ## Performance tuning {#performance-tuning} One way of increasing performance is to adjust the batch size and the number of records that are fetched from Kafka by adding the following to the **worker** configuration: @@ -75,7 +74,7 @@ consumer.max.poll.records=500 consumer.max.partition.fetch.bytes=1048576 ``` -You can find more details (both implementation and other considerations) in the official [Kafka](https://kafka.apache.org/documentation/#consumerconfigs) and +You can find more details (both implementation and other considerations) in the official [Kafka](https://kafka.apache.org/documentation/#consumerconfigs) and [Amazon MSK](https://docs.aws.amazon.com/msk/latest/developerguide/msk-connect-workers.html#msk-connect-create-custom-worker-config) documentation. ## Notes on networking for MSK Connect {#notes-on-networking-for-msk-connect} @@ -85,8 +84,7 @@ In order for MSK Connect to connect to ClickHouse, we recommend your MSK cluster 1. **Create a Private Subnet:** Create a new subnet within your VPC, designating it as a private subnet. This subnet should not have direct access to the internet. 1. **Create a NAT Gateway:** Create a NAT gateway in a public subnet of your VPC. The NAT gateway enables instances in your private subnet to connect to the internet or other AWS services, but prevents the internet from initiating a connection with those instances. 1. **Update the Route Table:** Add a route that directs internet-bound traffic to the NAT gateway -1. 
**Ensure Security Group(s) and Network ACLs Configuration:** Configure your [security groups](https://docs.aws.amazon.com/vpc/latest/userguide/vpc-security-groups.html) and [network ACLs (Access Control Lists)](https://docs.aws.amazon.com/vpc/latest/userguide/vpc-network-acls.html) to allow relevant traffic to and from your ClickHouse instance. - 1. For ClickHouse Cloud, configure your security group to allow inbound traffic on ports 9440 and 8443. - 1. For self-hosted ClickHouse, configure your security group to allow inbound traffic on the port in your config file (default is 8123). +1. **Ensure Security Group(s) and Network ACLs Configuration:** Configure your [security groups](https://docs.aws.amazon.com/vpc/latest/userguide/vpc-security-groups.html) and [network ACLs (Access Control Lists)](https://docs.aws.amazon.com/vpc/latest/userguide/vpc-network-acls.html) to allow relevant traffic to and from your ClickHouse instance. + 1. For ClickHouse Cloud, configure your security group to allow inbound traffic on ports 9440 and 8443. + 1. For self-hosted ClickHouse, configure your security group to allow inbound traffic on the port in your config file (default is 8123). 1. **Attach Security Group(s) to MSK:** Ensure that these new security groups routed to the NAT gateways are attached to your MSK cluster - diff --git a/docs/integrations/data-ingestion/redshift/index.md b/docs/integrations/data-ingestion/redshift/index.md index 5c013339e38..ba51c5d5f30 100644 --- a/docs/integrations/data-ingestion/redshift/index.md +++ b/docs/integrations/data-ingestion/redshift/index.md @@ -45,10 +45,9 @@ From the ClickHouse instance standpoint, you can either: 3. **[PIVOT](#pivot-data-from-redshift-to-clickhouse-using-s3)** using S3 object storage using an "Unload then load" logic -:::note -We used Redshift as a data source in this tutorial. However, the migration approaches presented here are not exclusive to Redshift, and similar steps can be derived for any compatible data source. -::: - + :::note + We used Redshift as a data source in this tutorial. However, the migration approaches presented here are not exclusive to Redshift, and similar steps can be derived for any compatible data source. + ::: ## Push data from Redshift to ClickHouse {#push-data-from-redshift-to-clickhouse} @@ -67,7 +66,6 @@ In the push scenario, the idea is to leverage a third-party tool or service (eit * Users need to set up and maintain an ETL/ELT infrastructure. * Introduces a third-party element in the architecture which can turn into a potential scalability bottleneck. - ## Pull data from Redshift to ClickHouse {#pull-data-from-redshift-to-clickhouse} In the pull scenario, the idea is to leverage the ClickHouse JDBC Bridge to connect to a Redshift cluster directly from a ClickHouse instance and perform `INSERT INTO ... SELECT` queries: @@ -83,26 +81,24 @@ In the pull scenario, the idea is to leverage the ClickHouse JDBC Bridge to conn * Requires a ClickHouse JDBC Bridge instance which can turn into a potential scalability bottleneck - -:::note -Even though Redshift is based on PostgreSQL, using the ClickHouse PostgreSQL table function or table engine is not possible since ClickHouse requires PostgreSQL version 9 or above and the Redshift API is based on an earlier version (8.x). 
-::: + :::note + Even though Redshift is based on PostgreSQL, using the ClickHouse PostgreSQL table function or table engine is not possible since ClickHouse requires PostgreSQL version 9 or above and the Redshift API is based on an earlier version (8.x). + ::: ### Tutorial {#tutorial} To use this option, you need to set up a ClickHouse JDBC Bridge. ClickHouse JDBC Bridge is a standalone Java application that handles JDBC connectivity and acts as a proxy between the ClickHouse instance and the data sources. For this tutorial, we used a pre-populated Redshift instance with a [sample database](https://docs.aws.amazon.com/redshift/latest/dg/c_sampledb.html). - 1. Deploy the ClickHouse JDBC Bridge. For more details, see our user guide on [JDBC for External Data sources](/integrations/data-ingestion/dbms/jdbc-with-clickhouse.md) -:::note -If you are using ClickHouse Cloud, you will need to run your ClickHouse JDBC Bridge on a separate environment and connect to ClickHouse Cloud using the [remoteSecure](/sql-reference/table-functions/remote/) function -::: + :::note + If you are using ClickHouse Cloud, you will need to run your ClickHouse JDBC Bridge on a separate environment and connect to ClickHouse Cloud using the [remoteSecure](/sql-reference/table-functions/remote/) function + ::: 2. Configure your Redshift datasource for ClickHouse JDBC Bridge. For example, `/etc/clickhouse-jdbc-bridge/config/datasources/redshift.json ` - ```json - { + ```json + { "redshift-server": { "aliases": [ "redshift" @@ -116,81 +112,80 @@ If you are using ClickHouse Cloud, you will need to run your ClickHouse JDBC Bri "password": "", "maximumPoolSize": 5 } - } - ``` + } + ``` 3. Once ClickHouse JDBC Bridge deployed and running, you can start querying your Redshift instance from ClickHouse - ```sql - SELECT * - FROM jdbc('redshift', 'select username, firstname, lastname from users limit 5') - ``` - - ```response - Query id: 1b7de211-c0f6-4117-86a2-276484f9f4c0 + ```sql + SELECT * + FROM jdbc('redshift', 'select username, firstname, lastname from users limit 5') + ``` - ┌─username─┬─firstname─┬─lastname─┐ - │ PGL08LJI │ Vladimir │ Humphrey │ - │ XDZ38RDD │ Barry │ Roy │ - │ AEB55QTM │ Reagan │ Hodge │ - │ OWY35QYB │ Tamekah │ Juarez │ - │ MSD36KVR │ Mufutau │ Watkins │ - └──────────┴───────────┴──────────┘ + ```response + Query id: 1b7de211-c0f6-4117-86a2-276484f9f4c0 - 5 rows in set. Elapsed: 0.438 sec. - ``` + ┌─username─┬─firstname─┬─lastname─┐ + │ PGL08LJI │ Vladimir │ Humphrey │ + │ XDZ38RDD │ Barry │ Roy │ + │ AEB55QTM │ Reagan │ Hodge │ + │ OWY35QYB │ Tamekah │ Juarez │ + │ MSD36KVR │ Mufutau │ Watkins │ + └──────────┴───────────┴──────────┘ - ```sql - SELECT * - FROM jdbc('redshift', 'select count(*) from sales') - ``` + 5 rows in set. Elapsed: 0.438 sec. + ``` - ```response - Query id: 2d0f957c-8f4e-43b2-a66a-cc48cc96237b + ```sql + SELECT * + FROM jdbc('redshift', 'select count(*) from sales') + ``` - ┌──count─┐ - │ 172456 │ - └────────┘ + ```response + Query id: 2d0f957c-8f4e-43b2-a66a-cc48cc96237b - 1 rows in set. Elapsed: 0.304 sec. - ``` + ┌──count─┐ + │ 172456 │ + └────────┘ + 1 rows in set. Elapsed: 0.304 sec. + ``` 4. In the following, we display importing data using an `INSERT INTO ... 
SELECT` statement - ```sql - # TABLE CREATION with 3 columns - CREATE TABLE users_imported - ( - `username` String, - `firstname` String, - `lastname` String - ) - ENGINE = MergeTree - ORDER BY firstname - ``` + ```sql + # TABLE CREATION with 3 columns + CREATE TABLE users_imported + ( + `username` String, + `firstname` String, + `lastname` String + ) + ENGINE = MergeTree + ORDER BY firstname + ``` - ```response - Query id: c7c4c44b-cdb2-49cf-b319-4e569976ab05 + ```response + Query id: c7c4c44b-cdb2-49cf-b319-4e569976ab05 - Ok. + Ok. - 0 rows in set. Elapsed: 0.233 sec. - ``` + 0 rows in set. Elapsed: 0.233 sec. + ``` - ```sql - # IMPORTING DATA - INSERT INTO users_imported (*) SELECT * - FROM jdbc('redshift', 'select username, firstname, lastname from users') - ``` + ```sql + # IMPORTING DATA + INSERT INTO users_imported (*) SELECT * + FROM jdbc('redshift', 'select username, firstname, lastname from users') + ``` - ```response - Query id: 9d3a688d-b45a-40f4-a7c7-97d93d7149f1 + ```response + Query id: 9d3a688d-b45a-40f4-a7c7-97d93d7149f1 - Ok. + Ok. - 0 rows in set. Elapsed: 4.498 sec. Processed 49.99 thousand rows, 2.49 MB (11.11 thousand rows/s., 554.27 KB/s.) - ``` + 0 rows in set. Elapsed: 4.498 sec. Processed 49.99 thousand rows, 2.49 MB (11.11 thousand rows/s., 554.27 KB/s.) + ``` ## Pivot data from Redshift to ClickHouse using S3 {#pivot-data-from-redshift-to-clickhouse-using-s3} @@ -257,6 +252,6 @@ In this scenario, we export data to S3 in an intermediary pivot format and, in a 0 rows in set. Elapsed: 0.545 sec. Processed 49.99 thousand rows, 2.34 MB (91.72 thousand rows/s., 4.30 MB/s.) ``` -:::note -This example used CSV as the pivot format. However, for production workloads we recommend Apache Parquet as the best option for large migrations since it comes with compression and can save some storage costs while reducing transfer times. (By default, each row group is compressed using SNAPPY). ClickHouse also leverages Parquet's column orientation to speed up data ingestion. -::: + :::note + This example used CSV as the pivot format. However, for production workloads we recommend Apache Parquet as the best option for large migrations since it comes with compression and can save some storage costs while reducing transfer times. (By default, each row group is compressed using SNAPPY). ClickHouse also leverages Parquet's column orientation to speed up data ingestion. + ::: diff --git a/docs/integrations/data-ingestion/s3-minio.md b/docs/integrations/data-ingestion/s3-minio.md index 4633cbd66e2..c40e23dbac2 100644 --- a/docs/integrations/data-ingestion/s3-minio.md +++ b/docs/integrations/data-ingestion/s3-minio.md @@ -12,7 +12,6 @@ import SelfManaged from '@site/docs/_snippets/_self_managed_only_no_roadmap.md'; - All S3 functions and tables and compatible with [MinIO](https://min.io/). Users may experience superior throughput on self-hosted MinIO stores, especially in the event of optimal network locality. Also backed merge tree configuration is compatible too, with some minor changes in configuration: diff --git a/docs/integrations/data-ingestion/s3/index.md b/docs/integrations/data-ingestion/s3/index.md index d559e343ebb..dd12b9a142a 100644 --- a/docs/integrations/data-ingestion/s3/index.md +++ b/docs/integrations/data-ingestion/s3/index.md @@ -31,7 +31,7 @@ where: * structure — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. * compression — Parameter is optional. Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. 
By default, it will autodetect compression by file extension. -Using wildcards in the path expression allow multiple files to be referenced and opens the door for parallelism. + Using wildcards in the path expression allow multiple files to be referenced and opens the door for parallelism. ### Preparation {#preparation} @@ -153,7 +153,6 @@ ORDER BY pickup_datetime Note the use of [partitioning](/engines/table-engines/mergetree-family/custom-partitioning-key) on the `pickup_date` field. Usually a partition key is for data management, but later on we will use this key to parallelize writes to S3. - Each entry in our taxi dataset contains a taxi trip. This anonymized data consists of 20M records compressed in the S3 bucket https://datasets-documentation.s3.eu-west-3.amazonaws.com/ under the folder **nyc-taxi**. The data is in the TSV format with approximately 1M rows per file. ### Reading Data from S3 {#reading-data-from-s3} @@ -188,7 +187,6 @@ LIMIT 5; Confirm the number of rows in this sample dataset. Note the use of wildcards for file expansion, so we consider all twenty files. This query will take around 10 seconds, depending on the number of cores on the ClickHouse instance: - ```sql SELECT count() AS count FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/trips_*.gz', 'TabSeparatedWithNames'); @@ -215,7 +213,6 @@ clickhouse-local --query "SELECT * FROM s3('https://datasets-documentation.s3.eu To exploit the full capabilities of ClickHouse, we next read and insert the data into our instance. We combine our `s3` function with a simple `INSERT` statement to achieve this. Note that we aren't required to list our columns because our target table provides the required structure. This requires the columns to appear in the order specified in the table DDL statement: columns are mapped according to their position in the `SELECT` clause. The insertion of all 10m rows can take a few minutes depending on the ClickHouse instance. Below we insert 1M rows to ensure a prompt response. Adjust the `LIMIT` clause or column selection to import subsets as required: - ```sql INSERT INTO trips SELECT * @@ -262,7 +259,6 @@ It is unlikely you will want to export your data as a single file. Most tools, i In the example below, we create ten files using a modulus of the `rand()` function. Notice how the resulting partition ID is referenced in the filename. This results in ten files with a numerical suffix, e.g. `trips_0.csv.lz4`, `trips_1.csv.lz4` etc...: - ```sql INSERT INTO FUNCTION s3( @@ -311,23 +307,21 @@ s3Cluster(cluster_name, source, [access_key_id, secret_access_key,] format, stru * `format` — The [format](/interfaces/formats#formats-overview) of the file. * `structure` — Structure of the table. Format 'column1_name column1_type, column2_name column2_type, ...'. + Like any `s3` functions, the credentials are optional if the bucket is insecure or you define security through the environment, e.g., IAM roles. Unlike the s3 function, however, the structure must be specified in the request as of 22.3.1, i.e., the schema is not inferred. -Like any `s3` functions, the credentials are optional if the bucket is insecure or you define security through the environment, e.g., IAM roles. Unlike the s3 function, however, the structure must be specified in the request as of 22.3.1, i.e., the schema is not inferred. + This function will be used as part of an `INSERT INTO SELECT` in most cases. In this case, you will often be inserting a distributed table. 
We illustrate a simple example below where trips_all is a distributed table. While this table uses the events cluster, the consistency of the nodes used for reads and writes is not a requirement: -This function will be used as part of an `INSERT INTO SELECT` in most cases. In this case, you will often be inserting a distributed table. We illustrate a simple example below where trips_all is a distributed table. While this table uses the events cluster, the consistency of the nodes used for reads and writes is not a requirement: - -```sql -INSERT INTO default.trips_all - SELECT * - FROM s3Cluster( + ```sql + INSERT INTO default.trips_all + SELECT * + FROM s3Cluster( 'events', 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/trips_*.gz', 'TabSeparatedWithNames' ) -``` - -Inserts will occur against the initiator node. This means that while reads will occur on each node, the resulting rows will be routed to the initiator for distribution. In high throughput scenarios, this may prove a bottleneck. To address this, set the parameter [parallel_distributed_insert_select](/operations/settings/settings/#parallel_distributed_insert_select) for the `s3cluster` function. + ``` + Inserts will occur against the initiator node. This means that while reads will occur on each node, the resulting rows will be routed to the initiator for distribution. In high throughput scenarios, this may prove a bottleneck. To address this, set the parameter [parallel_distributed_insert_select](/operations/settings/settings/#parallel_distributed_insert_select) for the `s3cluster` function. ## S3 table engines {#s3-table-engines} @@ -348,7 +342,6 @@ CREATE TABLE s3_engine_table (name String, value UInt32) In the following example, we create a table named `trips_raw` using the first ten TSV files located in the `https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/` bucket. Each of these contains 1M rows each: - ```sql CREATE TABLE trips_raw ( @@ -471,9 +464,9 @@ Note that rows can only be inserted into new files. There are no merge cycles or * Specify the setting `s3_create_new_file_on_insert=1`. This will cause the creation of new files on each insert. A numeric suffix will be appended to the end of each file that will monotonically increase for each insert operation. For the above example, a subsequent insert would cause the creation of a trips_1.bin file. * Specify the setting `s3_truncate_on_insert=1`. This will cause a truncation of the file, i.e. it will only contain the newly inserted rows once complete. -Both of these settings default to 0 - thus forcing the user to set one of them. `s3_truncate_on_insert` will take precedence if both are set. + Both of these settings default to 0 - thus forcing the user to set one of them. `s3_truncate_on_insert` will take precedence if both are set. -Some notes about the `S3` table engine: + Some notes about the `S3` table engine: - Unlike a traditional `MergeTree` family table, dropping an `S3` table will not delete the underlying data. - Full settings for this table type can be found [here](/engines/table-engines/integrations/s3.md/#settings). @@ -517,12 +510,12 @@ In the previous examples, we have passed credentials in the `s3` function or `S3 This setting turns on an attempt to retrieve S3 credentials from the environment, thus allowing access through IAM roles. 
Specifically, the following order of retrieval is performed: - * A lookup for the environment variables `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_SESSION_TOKEN` - * Check performed in **$HOME/.aws** - * Temporary credentials obtained via the AWS Security Token Service - i.e. via [`AssumeRole`](https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html) API - * Checks for credentials in the ECS environment variables `AWS_CONTAINER_CREDENTIALS_RELATIVE_URI` or `AWS_CONTAINER_CREDENTIALS_FULL_URI` and `AWS_ECS_CONTAINER_AUTHORIZATION_TOKEN`. - * Obtains the credentials via [Amazon EC2 instance metadata](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-metadata.html) provided [AWS_EC2_METADATA_DISABLED](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html#envvars-list-AWS_EC2_METADATA_DISABLED) is not set to true. - * These same settings can also be set for a specific endpoint, using the same prefix matching rule. + * A lookup for the environment variables `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_SESSION_TOKEN` + * Check performed in **$HOME/.aws** + * Temporary credentials obtained via the AWS Security Token Service - i.e. via [`AssumeRole`](https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html) API + * Checks for credentials in the ECS environment variables `AWS_CONTAINER_CREDENTIALS_RELATIVE_URI` or `AWS_CONTAINER_CREDENTIALS_FULL_URI` and `AWS_ECS_CONTAINER_AUTHORIZATION_TOKEN`. + * Obtains the credentials via [Amazon EC2 instance metadata](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-metadata.html) provided [AWS_EC2_METADATA_DISABLED](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html#envvars-list-AWS_EC2_METADATA_DISABLED) is not set to true. + * These same settings can also be set for a specific endpoint, using the same prefix matching rule. ## Optimizing for performance {#s3-optimizing-performance} @@ -686,7 +679,6 @@ The following notes cover the implementation of S3 interactions with ClickHouse. * Reads on S3 are asynchronous by default. This behavior is determined by setting `remote_filesystem_read_method`, set to the value `threadpool` by default. When serving a request, ClickHouse reads granules in stripes. Each of these stripes potentially contain many columns. A thread will read the columns for their granules one by one. Rather than doing this synchronously, a prefetch is made for all columns before waiting for the data. This offers significant performance improvements over synchronous waits on each column. Users will not need to change this setting in most cases - see [Optimizing for Performance](#s3-optimizing-performance). * Writes are performed in parallel, with a maximum of 100 concurrent file writing threads. `max_insert_delayed_streams_for_parallel_write`, which has a default value of 1000, controls the number of S3 blobs written in parallel. Since a buffer is required for each file being written (~1MB), this effectively limits the memory consumption of an INSERT. It may be appropriate to lower this value in low server memory scenarios. 
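To observe the effect of the asynchronous read path described above, the default can be compared against plain synchronous reads on the public taxi dataset used earlier in this guide. This is a sketch only, and it assumes `remote_filesystem_read_method` can be overridden at the query level on your version:

```sql
-- Force synchronous reads for a one-off comparison; omit the SETTINGS clause
-- (or set it back to 'threadpool', the default) to restore asynchronous prefetching.
SELECT count()
FROM s3(
    'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/trips_*.gz',
    'TabSeparatedWithNames'
)
SETTINGS remote_filesystem_read_method = 'read';
```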
- ## Use S3 object storage as a ClickHouse disk {#configuring-s3-for-clickhouse-use} If you need step-by-step instructions to create buckets and an IAM role, then expand **Create S3 buckets and an IAM role** and follow along: @@ -696,14 +688,14 @@ If you need step-by-step instructions to create buckets and an IAM role, then ex ### Configure ClickHouse to use the S3 bucket as a disk {#configure-clickhouse-to-use-the-s3-bucket-as-a-disk} The following example is based on a Linux Deb package installed as a service with default ClickHouse directories. -1. Create a new file in the ClickHouse `config.d` directory to store the storage configuration. -```bash -vim /etc/clickhouse-server/config.d/storage_config.xml -``` +1. Create a new file in the ClickHouse `config.d` directory to store the storage configuration. + ```bash + vim /etc/clickhouse-server/config.d/storage_config.xml + ``` 2. Add the following for storage configuration; substituting the bucket path, access key and secret keys from earlier steps -```xml - - + ```xml + + s3 @@ -728,37 +720,37 @@ vim /etc/clickhouse-server/config.d/storage_config.xml - - -``` + + + ``` -:::note -The tags `s3_disk` and `s3_cache` within the `` tag are arbitrary labels. These can be set to something else but the same label must be used in the `` tab under the `` tab to reference the disk. -The `` tag is also arbitrary and is the name of the policy which will be used as the identifier storage target when creating resources in ClickHouse. + :::note + The tags `s3_disk` and `s3_cache` within the `` tag are arbitrary labels. These can be set to something else but the same label must be used in the `` tab under the `` tab to reference the disk. + The `` tag is also arbitrary and is the name of the policy which will be used as the identifier storage target when creating resources in ClickHouse. -The configuration shown above is for ClickHouse version 22.8 or higher, if you are using an older version please see the [storing data](/operations/storing-data.md/#using-local-cache) docs. + The configuration shown above is for ClickHouse version 22.8 or higher, if you are using an older version please see the [storing data](/operations/storing-data.md/#using-local-cache) docs. -For more information about using S3: -Integrations Guide: [S3 Backed MergeTree](#s3-backed-mergetree) -::: + For more information about using S3: + Integrations Guide: [S3 Backed MergeTree](#s3-backed-mergetree) + ::: 3. Update the owner of the file to the `clickhouse` user and group -```bash -chown clickhouse:clickhouse /etc/clickhouse-server/config.d/storage_config.xml -``` + ```bash + chown clickhouse:clickhouse /etc/clickhouse-server/config.d/storage_config.xml + ``` 4. Restart the ClickHouse instance to have the changes take effect. -```bash -service clickhouse-server restart -``` + ```bash + service clickhouse-server restart + ``` ### Testing {#testing} 1. Log in with the ClickHouse client, something like the following -```bash -clickhouse-client --user default --password ClickHouse123! -``` + ```bash + clickhouse-client --user default --password ClickHouse123! + ``` 2. Create a table specifying the new S3 storage policy -```sql -CREATE TABLE s3_table1 + ```sql + CREATE TABLE s3_table1 ( `id` UInt64, `column1` String @@ -766,58 +758,58 @@ CREATE TABLE s3_table1 ENGINE = MergeTree ORDER BY id SETTINGS storage_policy = 's3_main'; -``` + ``` 3. 
Show that the table was created with the correct policy -```sql -SHOW CREATE TABLE s3_table1; -``` -```response -┌─statement──────────────────────────────────────────────────── -│ CREATE TABLE default.s3_table1 -( + ```sql + SHOW CREATE TABLE s3_table1; + ``` + ```response + ┌─statement──────────────────────────────────────────────────── + │ CREATE TABLE default.s3_table1 + ( `id` UInt64, `column1` String -) -ENGINE = MergeTree -ORDER BY id -SETTINGS storage_policy = 's3_main', index_granularity = 8192 -└────────────────────────────────────────────────────────────── -``` + ) + ENGINE = MergeTree + ORDER BY id + SETTINGS storage_policy = 's3_main', index_granularity = 8192 + └────────────────────────────────────────────────────────────── + ``` 4. Insert test rows into the table -```sql -INSERT INTO s3_table1 + ```sql + INSERT INTO s3_table1 (id, column1) VALUES (1, 'abc'), (2, 'xyz'); -``` -```response -INSERT INTO s3_table1 (id, column1) FORMAT Values + ``` + ```response + INSERT INTO s3_table1 (id, column1) FORMAT Values -Query id: 0265dd92-3890-4d56-9d12-71d4038b85d5 + Query id: 0265dd92-3890-4d56-9d12-71d4038b85d5 -Ok. + Ok. -2 rows in set. Elapsed: 0.337 sec. -``` + 2 rows in set. Elapsed: 0.337 sec. + ``` 5. View the rows -```sql -SELECT * FROM s3_table1; -``` -```response -┌─id─┬─column1─┐ -│ 1 │ abc │ -│ 2 │ xyz │ -└────┴─────────┘ + ```sql + SELECT * FROM s3_table1; + ``` + ```response + ┌─id─┬─column1─┐ + │ 1 │ abc │ + │ 2 │ xyz │ + └────┴─────────┘ -2 rows in set. Elapsed: 0.284 sec. -``` -6. In the AWS console, navigate to the buckets, and select the new one and the folder. -You should see something like the following: + 2 rows in set. Elapsed: 0.284 sec. + ``` +6. In the AWS console, navigate to the buckets, and select the new one and the folder. + You should see something like the following: - + ## Replicating a single shard across two AWS regions using S3 Object Storage {#s3-multi-region} @@ -1009,7 +1001,6 @@ This setting should be false for two reasons: 1) this feature is not production ``` - ClickHouse Keeper is responsible for coordinating the replication of data across the ClickHouse nodes. To inform ClickHouse about the ClickHouse Keeper nodes add a configuration file to each of the ClickHouse nodes. ```xml title="/etc/clickhouse-server/config.d/use_keeper.xml" @@ -1100,59 +1091,59 @@ sudo service clickhouse-server start When you added the [cluster configuration](#define-a-cluster) a single shard replicated across the two ClickHouse nodes was defined. In this verification step you will check that the cluster was built when ClickHouse was started, and you will create a replicated table using that cluster. - Verify that the cluster exists: - ```sql - show clusters - ``` - ```response - ┌─cluster───────┐ - │ cluster_1S_2R │ - └───────────────┘ + ```sql + show clusters + ``` + ```response + ┌─cluster───────┐ + │ cluster_1S_2R │ + └───────────────┘ - 1 row in set. Elapsed: 0.009 sec. ` - ``` + 1 row in set. Elapsed: 0.009 sec. 
` + ``` - Create a table in the cluster using the `ReplicatedMergeTree` table engine: - ```sql - create table trips on cluster 'cluster_1S_2R' ( - `trip_id` UInt32, - `pickup_date` Date, - `pickup_datetime` DateTime, - `dropoff_datetime` DateTime, - `pickup_longitude` Float64, - `pickup_latitude` Float64, - `dropoff_longitude` Float64, - `dropoff_latitude` Float64, - `passenger_count` UInt8, - `trip_distance` Float64, - `tip_amount` Float32, - `total_amount` Float32, - `payment_type` Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4)) - ENGINE = ReplicatedMergeTree - PARTITION BY toYYYYMM(pickup_date) - ORDER BY pickup_datetime - SETTINGS storage_policy='s3_main' - ``` - ```response - ┌─host────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ - │ chnode1 │ 9000 │ 0 │ │ 1 │ 0 │ - │ chnode2 │ 9000 │ 0 │ │ 0 │ 0 │ - └─────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ - ``` + ```sql + create table trips on cluster 'cluster_1S_2R' ( + `trip_id` UInt32, + `pickup_date` Date, + `pickup_datetime` DateTime, + `dropoff_datetime` DateTime, + `pickup_longitude` Float64, + `pickup_latitude` Float64, + `dropoff_longitude` Float64, + `dropoff_latitude` Float64, + `passenger_count` UInt8, + `trip_distance` Float64, + `tip_amount` Float32, + `total_amount` Float32, + `payment_type` Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4)) + ENGINE = ReplicatedMergeTree + PARTITION BY toYYYYMM(pickup_date) + ORDER BY pickup_datetime + SETTINGS storage_policy='s3_main' + ``` + ```response + ┌─host────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ + │ chnode1 │ 9000 │ 0 │ │ 1 │ 0 │ + │ chnode2 │ 9000 │ 0 │ │ 0 │ 0 │ + └─────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘ + ``` - Understand the use of the macros defined earlier - The macros `shard`, and `replica` were [defined earlier](#define-a-cluster), and in the highlighted line below you can see where the values are substituted on each ClickHouse node. Additionally, the value `uuid` is used; `uuid` is not defined in the macros as it is generated by the system. - ```sql - SELECT create_table_query - FROM system.tables - WHERE name = 'trips' - FORMAT Vertical - ``` - ```response - Query id: 4d326b66-0402-4c14-9c2f-212bedd282c0 + The macros `shard`, and `replica` were [defined earlier](#define-a-cluster), and in the highlighted line below you can see where the values are substituted on each ClickHouse node. Additionally, the value `uuid` is used; `uuid` is not defined in the macros as it is generated by the system. 
+ ```sql + SELECT create_table_query + FROM system.tables + WHERE name = 'trips' + FORMAT Vertical + ``` + ```response + Query id: 4d326b66-0402-4c14-9c2f-212bedd282c0 - Row 1: - ────── - create_table_query: CREATE TABLE default.trips (`trip_id` UInt32, `pickup_date` Date, `pickup_datetime` DateTime, `dropoff_datetime` DateTime, `pickup_longitude` Float64, `pickup_latitude` Float64, `dropoff_longitude` Float64, `dropoff_latitude` Float64, `passenger_count` UInt8, `trip_distance` Float64, `tip_amount` Float32, `total_amount` Float32, `payment_type` Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4)) + Row 1: + ────── + create_table_query: CREATE TABLE default.trips (`trip_id` UInt32, `pickup_date` Date, `pickup_datetime` DateTime, `dropoff_datetime` DateTime, `pickup_longitude` Float64, `pickup_latitude` Float64, `dropoff_longitude` Float64, `dropoff_latitude` Float64, `passenger_count` UInt8, `trip_distance` Float64, `tip_amount` Float32, `total_amount` Float32, `payment_type` Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4)) # highlight-next-line ENGINE = ReplicatedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}') PARTITION BY toYYYYMM(pickup_date) ORDER BY pickup_datetime SETTINGS storage_policy = 's3_main' @@ -1168,9 +1159,9 @@ When you added the [cluster configuration](#define-a-cluster) a single shard rep These tests will verify that data is being replicated across the two servers, and that it is stored in the S3 Buckets and not on local disk. - Add data from the New York City taxi dataset: - ```sql - INSERT INTO trips - SELECT trip_id, + ```sql + INSERT INTO trips + SELECT trip_id, pickup_date, pickup_datetime, dropoff_datetime, @@ -1184,46 +1175,46 @@ These tests will verify that data is being replicated across the two servers, an total_amount, payment_type FROM s3('https://ch-nyc-taxi.s3.eu-west-3.amazonaws.com/tsv/trips_{0..9}.tsv.gz', 'TabSeparatedWithNames') LIMIT 1000000; - ``` + ``` - Verify that data is stored in S3. - This query shows the size of the data on disk, and the policy used to determine which disk is used. - ```sql - SELECT + This query shows the size of the data on disk, and the policy used to determine which disk is used. + ```sql + SELECT engine, data_paths, metadata_path, storage_policy, formatReadableSize(total_bytes) - FROM system.tables - WHERE name = 'trips' - FORMAT Vertical - ``` - ```response - Query id: af7a3d1b-7730-49e0-9314-cc51c4cf053c - - Row 1: - ────── - engine: ReplicatedMergeTree - data_paths: ['/var/lib/clickhouse/disks/s3_disk/store/551/551a859d-ec2d-4512-9554-3a4e60782853/'] - metadata_path: /var/lib/clickhouse/store/e18/e18d3538-4c43-43d9-b083-4d8e0f390cf7/trips.sql - storage_policy: s3_main - formatReadableSize(total_bytes): 36.42 MiB - - 1 row in set. Elapsed: 0.009 sec. - ``` + FROM system.tables + WHERE name = 'trips' + FORMAT Vertical + ``` + ```response + Query id: af7a3d1b-7730-49e0-9314-cc51c4cf053c + + Row 1: + ────── + engine: ReplicatedMergeTree + data_paths: ['/var/lib/clickhouse/disks/s3_disk/store/551/551a859d-ec2d-4512-9554-3a4e60782853/'] + metadata_path: /var/lib/clickhouse/store/e18/e18d3538-4c43-43d9-b083-4d8e0f390cf7/trips.sql + storage_policy: s3_main + formatReadableSize(total_bytes): 36.42 MiB + + 1 row in set. Elapsed: 0.009 sec. + ``` - Check the size of data on the local disk. From above, the size on disk for the millions of rows stored is 36.42 MiB. This should be on S3, and not the local disk. The query above also tells us where on local disk data and metadata is stored. 
Check the local data: - ```response - root@chnode1:~# du -sh /var/lib/clickhouse/disks/s3_disk/store/551 - 536K /var/lib/clickhouse/disks/s3_disk/store/551 - ``` + Check the size of data on the local disk. From above, the size on disk for the millions of rows stored is 36.42 MiB. This should be on S3, and not the local disk. The query above also tells us where on local disk data and metadata is stored. Check the local data: + ```response + root@chnode1:~# du -sh /var/lib/clickhouse/disks/s3_disk/store/551 + 536K /var/lib/clickhouse/disks/s3_disk/store/551 + ``` - Check the S3 data in each S3 Bucket (the totals are not shown, but both buckets have approximately 36 MiB stored after the inserts): + Check the S3 data in each S3 Bucket (the totals are not shown, but both buckets have approximately 36 MiB stored after the inserts): - + - + ## S3Express {#s3express} @@ -1243,10 +1234,10 @@ Creating a table with storage backed by a S3Express bucket involves the followin 2. Install appropriate bucket policy to grant all required permissions to your S3 user (e.g. `"Action": "s3express:*"` to simply allow unrestricted access) 3. When configuring the storage policy please provide the `region` parameter -Storage configuration is the same as for ordinary S3 and for example might look the following way: + Storage configuration is the same as for ordinary S3 and for example might look the following way: -``` sql - + ``` sql + s3 @@ -1265,21 +1256,21 @@ Storage configuration is the same as for ordinary S3 and for example might look - -``` + + ``` -And then create a table on the new storage: + And then create a table on the new storage: -``` sql -CREATE TABLE t -( + ``` sql + CREATE TABLE t + ( a UInt64, s String -) -ENGINE = MergeTree -ORDER BY a -SETTINGS storage_policy = 's3_express'; -``` + ) + ENGINE = MergeTree + ORDER BY a + SETTINGS storage_policy = 's3_express'; + ``` ### S3 storage {#s3-storage} diff --git a/docs/integrations/data-ingestion/s3/performance.md b/docs/integrations/data-ingestion/s3/performance.md index dac4ec04dd5..d6ac3cf3d63 100644 --- a/docs/integrations/data-ingestion/s3/performance.md +++ b/docs/integrations/data-ingestion/s3/performance.md @@ -15,7 +15,7 @@ import InsertThreads from '@site/static/images/integrations/data-ingestion/s3/in import S3Cluster from '@site/static/images/integrations/data-ingestion/s3/s3Cluster.png'; import HardwareSize from '@site/static/images/integrations/data-ingestion/s3/hardware_size.png'; -This section focuses on optimizing performance when reading and inserting data from S3 using the [s3 table functions](/sql-reference/table-functions/s3). +This section focuses on optimizing performance when reading and inserting data from S3 using the [s3 table functions](/sql-reference/table-functions/s3). :::info **The lesson described in this guide can be applied to other object storage implementations with their own dedicated table functions such as [GCS](/sql-reference/table-functions/gcs) and [Azure Blob storage](/sql-reference/table-functions/azureBlobStorage).** @@ -35,7 +35,7 @@ When performing an `INSERT INTO SELECT`, ClickHouse receives some data portion, The insert block size impacts both the [disk file I/O usage](https://en.wikipedia.org/wiki/Category:Disk_file_systems) and memory usage of a ClickHouse server. Larger insert blocks use more memory but generate larger and fewer initial parts. 
The fewer parts ClickHouse needs to create for loading a large amount of data, the less disk file I/O and automatic [background merges required](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part1#more-parts--more-background-part-merges). -When using an `INSERT INTO SELECT` query in combination with an integration table engine or a table function, the data is pulled by the ClickHouse server: +When using an `INSERT INTO SELECT` query in combination with an integration table engine or a table function, the data is pulled by the ClickHouse server: @@ -46,7 +46,7 @@ Until the data is completely loaded, the server executes a loop: ② Write the block into a new part on storage. -Go to ① +Go to ① ``` In ①, the size depends on the insert block size, which can be controlled with two settings: @@ -54,13 +54,13 @@ In ①, the size depends on the insert block size, which can be controlled with - [`min_insert_block_size_rows`](/operations/settings/settings#min_insert_block_size_rows) (default: `1048545` million rows) - [`min_insert_block_size_bytes`](/operations/settings/settings#min_insert_block_size_bytes) (default: `256 MiB`) -When either the specified number of rows is collected in the insert block, or the configured amount of data is reached (whichever happens first), then this will trigger the block being written into a new part. The insert loop continues at step ①. + When either the specified number of rows is collected in the insert block, or the configured amount of data is reached (whichever happens first), then this will trigger the block being written into a new part. The insert loop continues at step ①. -Note that the `min_insert_block_size_bytes` value denotes the uncompressed in-memory block size (and not the compressed on-disk part size). Also, note that the created blocks and parts rarely precisely contain the configured number of rows or bytes because ClickHouse streams and [processes](https://clickhouse.com/company/events/query-performance-introspection) data row-[block](/operations/settings/settings#max_block_size)-wise. Therefore, these settings specify minimum thresholds. + Note that the `min_insert_block_size_bytes` value denotes the uncompressed in-memory block size (and not the compressed on-disk part size). Also, note that the created blocks and parts rarely precisely contain the configured number of rows or bytes because ClickHouse streams and [processes](https://clickhouse.com/company/events/query-performance-introspection) data row-[block](/operations/settings/settings#max_block_size)-wise. Therefore, these settings specify minimum thresholds. #### Be aware of merges {#be-aware-of-merges} -The smaller the configured insert block size is, the more initial parts get created for a large data load, and the more background part merges are executed concurrently with the data ingestion. This can cause resource contention (CPU and memory) and require additional time (for reaching a [healthy](/operations/settings/merge-tree-settings#parts_to_throw_insert) (3000) number of parts) after the ingestion is finished. +The smaller the configured insert block size is, the more initial parts get created for a large data load, and the more background part merges are executed concurrently with the data ingestion. This can cause resource contention (CPU and memory) and require additional time (for reaching a [healthy](/operations/settings/merge-tree-settings#parts_to_throw_insert) (3000) number of parts) after the ingestion is finished. 
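One practical way to keep an eye on this after a large load is to watch the number of active parts for the target table. The query below is a minimal sketch; `posts` is a placeholder table name:

```sql
-- Sketch: count the active (not yet merged-away) parts of a table while
-- background merges catch up after a large insert.
SELECT count() AS active_parts
FROM system.parts
WHERE database = currentDatabase()
  AND table = 'posts'      -- placeholder table name
  AND active;
```

Once this number settles back down, the post-ingestion merge activity has largely completed.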
:::important ClickHouse query performance will be negatively impacted if the part count exceeds the [recommended limits](/operations/settings/merge-tree-settings#parts_to_throw_insert). @@ -92,23 +92,23 @@ Parts that were merged into larger parts are marked as [inactive](/operations/sy A ClickHouse server can process and insert data in parallel. The level of insert parallelism impacts the ingest throughput and memory usage of a ClickHouse server. Loading and processing data in parallel requires more main memory but increases the ingest throughput as data is processed faster. -Table functions like s3 allow specifying sets of to-be-loaded-file names via glob patterns. When a glob pattern matches multiple existing files, ClickHouse can parallelize reads across and within these files and insert the data in parallel into a table by utilizing parallel running insert threads (per server): +Table functions like s3 allow specifying sets of to-be-loaded-file names via glob patterns. When a glob pattern matches multiple existing files, ClickHouse can parallelize reads across and within these files and insert the data in parallel into a table by utilizing parallel running insert threads (per server): -Until all data from all files is processed, each insert thread executes a loop: +Until all data from all files is processed, each insert thread executes a loop: ```bash ① Get the next portion of unprocessed file data (portion size is based on the configured block size) and create an in-memory data block from it. ② Write the block into a new part on storage. -Go to ①. +Go to ①. ``` The number of such parallel insert threads can be configured with the [`max_insert_threads`](/operations/settings/settings#max_insert_threads) setting. The default value is `1` for open-source ClickHouse and 4 for [ClickHouse Cloud](https://clickhouse.com/cloud). -With a large number of files, the parallel processing by multiple insert threads works well. It can fully saturate both the available CPU cores and the network bandwidth (for parallel file downloads). In scenarios where just a few large files will be loaded into a table, ClickHouse automatically establishes a high level of data processing parallelism and optimizes network bandwidth usage by spawning additional reader threads per insert thread for reading (downloading) more distinct ranges within large files in parallel. +With a large number of files, the parallel processing by multiple insert threads works well. It can fully saturate both the available CPU cores and the network bandwidth (for parallel file downloads). In scenarios where just a few large files will be loaded into a table, ClickHouse automatically establishes a high level of data processing parallelism and optimizes network bandwidth usage by spawning additional reader threads per insert thread for reading (downloading) more distinct ranges within large files in parallel. For the s3 function and table, parallel downloading of an individual file is determined by the values [max_download_threads](https://clickhouse.com/codebrowser/ClickHouse/src/Core/Settings.h.html#DB::SettingsTraits::Data::max_download_threads) and [max_download_buffer_size](https://clickhouse.com/codebrowser/ClickHouse/src/Core/Settings.h.html#DB::SettingsTraits::Data::max_download_buffer_size). Files will only be downloaded in parallel if their size is greater than `2 * max_download_buffer_size`. By default, the `max_download_buffer_size` default is set to 10MiB. 
In some cases, you can safely increase this buffer size to 50 MB (`max_download_buffer_size=52428800`), with the aim of ensuring each file was downloaded by a single thread. This can reduce the time each thread spends making S3 calls and thus also lower the S3 wait time. Furthermore, for files that are too small for parallel reading, to increase throughput, ClickHouse automatically prefetches data by pre-reading such files asynchronously. @@ -126,7 +126,7 @@ The number of available CPU cores and the size of RAM impacts the: - possible level of [insert parallelism](#insert-parallelism) - throughput of [background part merges](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part1#more-parts--more-background-part-merges) -and, therefore, the overall ingest throughput. + and, therefore, the overall ingest throughput. ## Region locality {#region-locality} @@ -144,9 +144,9 @@ ClickHouse can read files stored in S3 buckets in the [supported formats](/inter ## Example dataset {#example-dataset} -To illustrate further potential optimizations, purposes we will use [the posts from the Stack Overflow dataset](/data-modeling/schema-design#stack-overflow-dataset) - optimizing both the query and insert performance of this data. +To illustrate further potential optimizations, purposes we will use [the posts from the Stack Overflow dataset](/data-modeling/schema-design#stack-overflow-dataset) - optimizing both the query and insert performance of this data. -This dataset consists of 189 Parquet files, with one for every month between July 2008 and March 2024. +This dataset consists of 189 Parquet files, with one for every month between July 2008 and March 2024. Note that we use Parquet for performance, per our [recommendations above](#formats), executing all queries on a ClickHouse Cluster located in the same region as the bucket. This cluster has 3 nodes, each with 32GiB of RAM and 8 vCPUs. @@ -196,79 +196,79 @@ Read performance on S3 will scale linearly with the number of cores, provided yo * In low thread count scenarios, users may benefit from setting `remote_filesystem_read_method` to "read" to cause the synchronous reading of files from S3. * For the s3 function and table, parallel downloading of an individual file is determined by the values [`max_download_threads`](/operations/settings/settings#max_download_threads) and [`max_download_buffer_size`](/operations/settings/settings#max_download_buffer_size). While [`max_download_threads`](/operations/settings/settings#max_download_threads) controls the number of threads used, files will only be downloaded in parallel if their size is greater than 2 * `max_download_buffer_size`. By default, the `max_download_buffer_size` default is set to 10MiB. In some cases, you can safely increase this buffer size to 50 MB (`max_download_buffer_size=52428800`), with the aim of ensuring smaller files are only downloaded by a single thread. This can reduce the time each thread spends making S3 calls and thus also lower the S3 wait time. See [this blog post](https://clickhouse.com/blog/clickhouse-1-trillion-row-challenge) for an example of this. -Before making any changes to improve performance, ensure you measure appropriately. As S3 API calls are sensitive to latency and may impact client timings, use the query log for performance metrics, i.e., `system.query_log`. + Before making any changes to improve performance, ensure you measure appropriately. 
As S3 API calls are sensitive to latency and may impact client timings, use the query log for performance metrics, i.e., `system.query_log`. -Consider our earlier query, doubling the `max_threads` to `16` (default `max_thread` is the number of cores on a node) improves our read query performance by 2x at the expense of higher memory. Further increasing `max_threads` has diminishing returns as shown. + Consider our earlier query, doubling the `max_threads` to `16` (default `max_thread` is the number of cores on a node) improves our read query performance by 2x at the expense of higher memory. Further increasing `max_threads` has diminishing returns as shown. -```sql -SELECT + ```sql + SELECT OwnerDisplayName, count() AS num_posts -FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/by_month/*.parquet') -WHERE OwnerDisplayName NOT IN ('', 'anon') -GROUP BY OwnerDisplayName -ORDER BY num_posts DESC -LIMIT 5 -SETTINGS max_threads = 16 + FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/by_month/*.parquet') + WHERE OwnerDisplayName NOT IN ('', 'anon') + GROUP BY OwnerDisplayName + ORDER BY num_posts DESC + LIMIT 5 + SETTINGS max_threads = 16 -┌─OwnerDisplayName─┬─num_posts─┐ -│ user330315 │ 10344 │ -│ user4039065 │ 5316 │ -│ user149341 │ 4102 │ -│ user529758 │ 3700 │ -│ user3559349 │ 3068 │ -└──────────────────┴───────────┘ + ┌─OwnerDisplayName─┬─num_posts─┐ + │ user330315 │ 10344 │ + │ user4039065 │ 5316 │ + │ user149341 │ 4102 │ + │ user529758 │ 3700 │ + │ user3559349 │ 3068 │ + └──────────────────┴───────────┘ -5 rows in set. Elapsed: 1.505 sec. Processed 59.82 million rows, 24.03 GB (39.76 million rows/s., 15.97 GB/s.) -Peak memory usage: 178.58 MiB. + 5 rows in set. Elapsed: 1.505 sec. Processed 59.82 million rows, 24.03 GB (39.76 million rows/s., 15.97 GB/s.) + Peak memory usage: 178.58 MiB. -SETTINGS max_threads = 32 + SETTINGS max_threads = 32 -5 rows in set. Elapsed: 0.779 sec. Processed 59.82 million rows, 24.03 GB (76.81 million rows/s., 30.86 GB/s.) -Peak memory usage: 369.20 MiB. + 5 rows in set. Elapsed: 0.779 sec. Processed 59.82 million rows, 24.03 GB (76.81 million rows/s., 30.86 GB/s.) + Peak memory usage: 369.20 MiB. -SETTINGS max_threads = 64 + SETTINGS max_threads = 64 -5 rows in set. Elapsed: 0.674 sec. Processed 59.82 million rows, 24.03 GB (88.81 million rows/s., 35.68 GB/s.) -Peak memory usage: 639.99 MiB. -``` + 5 rows in set. Elapsed: 0.674 sec. Processed 59.82 million rows, 24.03 GB (88.81 million rows/s., 35.68 GB/s.) + Peak memory usage: 639.99 MiB. + ``` ## Tuning threads and block size for inserts {#tuning-threads-and-block-size-for-inserts} To achieve maximum ingestion performance, you must choose (1) an insert block size and (2) an appropriate level of insert parallelism based on (3) the amount of available CPU cores and RAM available. In summary: -- The larger we [configure the insert block size](#insert-block-size), the fewer parts ClickHouse has to create, and the fewer [disk file I/O](https://en.wikipedia.org/wiki/Category:Disk_file_systems) and [background merges](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part1#more-parts--more-background-part-merges) are required. 
+- The larger we [configure the insert block size](#insert-block-size), the fewer parts ClickHouse has to create, and the fewer [disk file I/O](https://en.wikipedia.org/wiki/Category:Disk_file_systems) and [background merges](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part1#more-parts--more-background-part-merges) are required. - The higher we configure the [number of parallel insert threads](#insert-parallelism), the faster the data will be processed. -There is a conflicting tradeoff between these two performance factors (plus a tradeoff with the background part merging). The amount of available main memory of ClickHouse servers is limited. Larger blocks use more main memory, which limits the number of parallel insert threads we can utilize. Conversely, a higher number of parallel insert threads requires more main memory, as the number of insert threads determines the number of insert blocks created in memory concurrently. This limits the possible size of insert blocks. Additionally, there can be resource contention between insert threads and background merge threads. A high number of configured insert threads (1) creates more parts that need to be merged and (2) takes away CPU cores and memory space from background merge threads. + There is a conflicting tradeoff between these two performance factors (plus a tradeoff with the background part merging). The amount of available main memory of ClickHouse servers is limited. Larger blocks use more main memory, which limits the number of parallel insert threads we can utilize. Conversely, a higher number of parallel insert threads requires more main memory, as the number of insert threads determines the number of insert blocks created in memory concurrently. This limits the possible size of insert blocks. Additionally, there can be resource contention between insert threads and background merge threads. A high number of configured insert threads (1) creates more parts that need to be merged and (2) takes away CPU cores and memory space from background merge threads. -For a detailed description of how the behavior of these parameters impacts performance and resources, we recommend [reading this blog post](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part2). As described in this blog post, tuning can involve a careful balance of the two parameters. This exhaustive testing is often impractical, so in summary, we recommend: + For a detailed description of how the behavior of these parameters impacts performance and resources, we recommend [reading this blog post](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part2). As described in this blog post, tuning can involve a careful balance of the two parameters. 
This exhaustive testing is often impractical, so in summary, we recommend: -```bash -• max_insert_threads: choose ~ half of the available CPU cores for insert threads (to leave enough dedicated cores for background merges) + ```bash + • max_insert_threads: choose ~ half of the available CPU cores for insert threads (to leave enough dedicated cores for background merges) -• peak_memory_usage_in_bytes: choose an intended peak memory usage; either all available RAM (if it is an isolated ingest) or half or less (to leave room for other concurrent tasks) + • peak_memory_usage_in_bytes: choose an intended peak memory usage; either all available RAM (if it is an isolated ingest) or half or less (to leave room for other concurrent tasks) -Then: -min_insert_block_size_bytes = peak_memory_usage_in_bytes / (~3 * max_insert_threads) -``` + Then: + min_insert_block_size_bytes = peak_memory_usage_in_bytes / (~3 * max_insert_threads) + ``` -With this formula, you can set `min_insert_block_size_rows` to 0 (to disable the row based threshold) while setting `max_insert_threads` to the chosen value and `min_insert_block_size_bytes` to the calculated result from the above formula. + With this formula, you can set `min_insert_block_size_rows` to 0 (to disable the row based threshold) while setting `max_insert_threads` to the chosen value and `min_insert_block_size_bytes` to the calculated result from the above formula. -Using this formula with our earlier Stack Overflow example. + Using this formula with our earlier Stack Overflow example. - `max_insert_threads=4` (8 cores per node) - `peak_memory_usage_in_bytes` - 32 GiB (100% of node resources) or `34359738368` bytes. - `min_insert_block_size_bytes` = `34359738368/(3*4) = 2863311530` -```sql -INSERT INTO posts SELECT * -FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/by_month/*.parquet') SETTINGS min_insert_block_size_rows=0, max_insert_threads=4, min_insert_block_size_bytes=2863311530 + ```sql + INSERT INTO posts SELECT * + FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/by_month/*.parquet') SETTINGS min_insert_block_size_rows=0, max_insert_threads=4, min_insert_block_size_bytes=2863311530 -0 rows in set. Elapsed: 128.566 sec. Processed 59.82 million rows, 24.03 GB (465.28 thousand rows/s., 186.92 MB/s.) -``` + 0 rows in set. Elapsed: 128.566 sec. Processed 59.82 million rows, 24.03 GB (465.28 thousand rows/s., 186.92 MB/s.) + ``` -As shown, tuning of these setting has improved insert performance by over `33%`. We leave this to the reader to see if they can improve single node performance further. + As shown, tuning of these setting has improved insert performance by over `33%`. We leave this to the reader to see if they can improve single node performance further. ## Scaling with resources and nodes {#scaling-with-resources-and-nodes} @@ -305,7 +305,7 @@ Individual nodes can also be bottlenecked by network and S3 GET requests, preven Eventually, horizontal scaling is often necessary due to hardware availability and cost-efficiency. In ClickHouse Cloud, production clusters have at least 3 nodes. Users may also wish to therefore utilize all nodes for an insert. -Utilizing a cluster for S3 reads requires using the `s3Cluster` function as described in [Utilizing Clusters](/integrations/s3#utilizing-clusters). This allows reads to be distributed across nodes. 
+Utilizing a cluster for S3 reads requires using the `s3Cluster` function as described in [Utilizing Clusters](/integrations/s3#utilizing-clusters). This allows reads to be distributed across nodes. The server that initially receives the insert query first resolves the glob pattern and then dispatches the processing of each matching file dynamically to itself and the other servers. @@ -315,7 +315,6 @@ We repeat our earlier read query distributing the workload across 3 nodes, adjus As noted in [Utilizing Clusters](/integrations/s3#utilizing-clusters) this work is distributed a file level. To benefit from this feature users will require a sufficient number of files i.e. at least > the number of nodes. - ```sql SELECT OwnerDisplayName, diff --git a/docs/integrations/data-sources/deltalake.md b/docs/integrations/data-sources/deltalake.md index 7e0efee394b..d260b687f48 100644 --- a/docs/integrations/data-sources/deltalake.md +++ b/docs/integrations/data-sources/deltalake.md @@ -9,6 +9,6 @@ import DeltaLakeFunction from '@site/docs/sql-reference/table-functions/deltalak # Delta Lake integration -Users can integrate with the Delta lake table format via the table function. +Users can integrate with the Delta lake table format via the table function. diff --git a/docs/integrations/data-sources/iceberg.md b/docs/integrations/data-sources/iceberg.md index cb6db487847..04a9dd262c1 100644 --- a/docs/integrations/data-sources/iceberg.md +++ b/docs/integrations/data-sources/iceberg.md @@ -9,6 +9,6 @@ import IcebergFunction from '@site/docs/sql-reference/table-functions/iceberg.md # Iceberg integration -Users can integrate with the Iceberg table format via the table function. +Users can integrate with the Iceberg table format via the table function. diff --git a/docs/integrations/data-sources/redis.md b/docs/integrations/data-sources/redis.md index b47d08ccb1d..848e653de22 100644 --- a/docs/integrations/data-sources/redis.md +++ b/docs/integrations/data-sources/redis.md @@ -9,6 +9,6 @@ import RedisFunction from '@site/docs/sql-reference/table-functions/redis.md'; # Redis integration -Users can integrate with Redis via the table function. +Users can integrate with Redis via the table function. diff --git a/docs/integrations/data-visualization/astrato-and-clickhouse.md b/docs/integrations/data-visualization/astrato-and-clickhouse.md index 60ca1cd3cb3..bd92cfb8cd6 100644 --- a/docs/integrations/data-visualization/astrato-and-clickhouse.md +++ b/docs/integrations/data-visualization/astrato-and-clickhouse.md @@ -35,36 +35,36 @@ When setting up your data connection, you'll need to know: - Database Credentials: Username, Password - + ## Creating the data connection to ClickHouse {#creating-the-data-connection-to-clickhouse} - Select **Data** in the sidebar, and select the **Data Connection** tab -(or, navigate to this link: https://app.astrato.io/data/sources) -​ + (or, navigate to this link: https://app.astrato.io/data/sources) + ​ - Click on the **New Data Connection** button in the top right side of the screen. - + - Select **ClickHouse**. - + - Complete the required fields in the connection dialogue box - + - Click **Test Connection**. If the connection is successful, give the data connection a **name** and click **Next.** - Set the **user access** to the data connection and click **connect.** - + -- A connection is created and a dataview is created. +- A connection is created and a dataview is created. -:::note -if a duplicate is created, a timestamp is added to the data source name. 
-::: + :::note + if a duplicate is created, a timestamp is added to the data source name. + ::: ## Creating a semantic model / data view {#creating-a-semantic-model--data-view} @@ -90,8 +90,7 @@ In just a few steps, you can build your first chart in Astrato. 3. Add dimension(s) 4. Add measure(s) - - + ### View generated SQL supporting each visualization {#view-generated-sql-supporting-each-visualization} @@ -99,7 +98,6 @@ Transparency and accuracy are at the heart of Astrato. We ensure that every quer - ### Example completed dashboard {#example-completed-dashboard} A beautiful complete dashboard or data app isn't far away now. To see more of what we've built, head to our demo gallery on our website. https://astrato.io/gallery diff --git a/docs/integrations/data-visualization/chartbrew-and-clickhouse.md b/docs/integrations/data-visualization/chartbrew-and-clickhouse.md index 21a6c30fbb5..b55a082863e 100644 --- a/docs/integrations/data-visualization/chartbrew-and-clickhouse.md +++ b/docs/integrations/data-visualization/chartbrew-and-clickhouse.md @@ -46,28 +46,28 @@ If you do not have a dataset to work with, you can add one of the examples. This 1. Log in to [Chartbrew](https://chartbrew.com/login) and go to the **Connections** tab. 2. Click **Create connection** and select **ClickHouse** from the available database options. - + 3. Enter the connection details for your ClickHouse database: - - **Display Name**: A name to identify the connection in Chartbrew. - - **Host**: The hostname or IP address of your ClickHouse server. - - **Port**: Typically `8443` for HTTPS connections. - - **Database Name**: The database you want to connect to. - - **Username**: Your ClickHouse username. - - **Password**: Your ClickHouse password. + - **Display Name**: A name to identify the connection in Chartbrew. + - **Host**: The hostname or IP address of your ClickHouse server. + - **Port**: Typically `8443` for HTTPS connections. + - **Database Name**: The database you want to connect to. + - **Username**: Your ClickHouse username. + - **Password**: Your ClickHouse password. - + 4. Click **Test connection** to verify that Chartbrew can connect to ClickHouse. 5. If the test is successful, click **Save connection**. Chartbrew will automatically retrieve the schema from ClickHouse. - + ## 3. Create a dataset and run a SQL query {#3-create-a-dataset-and-run-a-sql-query} - 1. Click on the **Create dataset** button or navigate to the **Datasets** tab to create one. - 2. Select the ClickHouse connection you created earlier. +1. Click on the **Create dataset** button or navigate to the **Datasets** tab to create one. +2. Select the ClickHouse connection you created earlier. @@ -91,11 +91,11 @@ If you do not have a dataset to work with, you can add one of the examples. This Once the data is retrieved, click **Configure dataset** to set up the visualization parameters. ## 4. Create a visualization {#4-create-a-visualization} - - 1. Define a metric (numerical value) and dimension (categorical value) for your visualization. - 2. Preview the dataset to ensure the query results are structured correctly. - 3. Choose a chart type (e.g., line chart, bar chart, pie chart) and add it to your dashboard. - 4. Click **Complete dataset** to finalize the setup. + +1. Define a metric (numerical value) and dimension (categorical value) for your visualization. +2. Preview the dataset to ensure the query results are structured correctly. +3. Choose a chart type (e.g., line chart, bar chart, pie chart) and add it to your dashboard. +4. 
Click **Complete dataset** to finalize the setup. @@ -104,12 +104,12 @@ Once the data is retrieved, click **Configure dataset** to set up the visualizat ## 5. Automate data updates {#5-automate-data-updates} - + To keep your dashboard up-to-date, you can schedule automatic data updates: - 1. Click the Calendar icon next to the dataset refresh button. - 2. Configure the update interval (e.g., every hour, every day). - 3. Save the settings to enable automatic refresh. +1. Click the Calendar icon next to the dataset refresh button. +2. Configure the update interval (e.g., every hour, every day). +3. Save the settings to enable automatic refresh. diff --git a/docs/integrations/data-visualization/deepnote.md b/docs/integrations/data-visualization/deepnote.md index de7aee622a8..c79e9eac8bf 100644 --- a/docs/integrations/data-visualization/deepnote.md +++ b/docs/integrations/data-visualization/deepnote.md @@ -18,7 +18,6 @@ import ConnectionDetails from '@site/docs/_snippets/_gather_your_details_http.md - Deepnote is a collaborative data notebook built for teams to discover and share insights. In addition to being Jupyter-compatible, it works in the cloud and provides you with one central place to collaborate and work on data science projects efficiently. This guide assumes you already have a Deepnote account and that you have a running ClickHouse instance. @@ -32,14 +31,14 @@ If you would like to explore an interactive example of querying ClickHouse from 1. Within Deepnote, select the "Integrations" overview and click on the ClickHouse tile. - + 2. Provide the connection details for your ClickHouse instance: - + - + - **_NOTE:_** If your connection to ClickHouse is protected with an IP Access List, you might need to allow Deepnote's IP addresses. Read more about it in [Deepnote's docs](https://docs.deepnote.com/integrations/authorize-connections-from-deepnote-ip-addresses). + **_NOTE:_** If your connection to ClickHouse is protected with an IP Access List, you might need to allow Deepnote's IP addresses. Read more about it in [Deepnote's docs](https://docs.deepnote.com/integrations/authorize-connections-from-deepnote-ip-addresses). 3. Congratulations! You have now integrated ClickHouse into Deepnote. @@ -47,7 +46,7 @@ If you would like to explore an interactive example of querying ClickHouse from 1. Start by connecting to the ClickHouse integration on the right of your notebook. - + 2. Now create a new ClickHouse query block and query your database. The query results will be saved as a DataFrame and stored in the variable specified in the SQL block. 3. You can also convert any existing [SQL block](https://docs.deepnote.com/features/sql-cells) to a ClickHouse block. diff --git a/docs/integrations/data-visualization/draxlr-and-clickhouse.md b/docs/integrations/data-visualization/draxlr-and-clickhouse.md index 86538fb13a1..fa8cc42fecd 100644 --- a/docs/integrations/data-visualization/draxlr-and-clickhouse.md +++ b/docs/integrations/data-visualization/draxlr-and-clickhouse.md @@ -23,7 +23,6 @@ import CommunityMaintainedBadge from '@theme/badges/CommunityMaintained'; Draxlr offers an intuitive interface for connecting to your ClickHouse database, enabling your team to explore, visualize, and publish insights within minutes. This guide will walk you through the steps to establish a successful connection. - ## 1. Get your ClickHouse credentials {#1-get-your-clickhouse-credentials} @@ -39,7 +38,7 @@ Draxlr offers an intuitive interface for connecting to your ClickHouse database, 5. 
Add the connection details in the form. - + 6. Click on the **Next** button and wait for the connection to be established. You will see the tables page if the connection is successful. @@ -51,12 +50,11 @@ Draxlr offers an intuitive interface for connecting to your ClickHouse database, 3. You can start adding the filters, make joins and add sort to your data. - + 4. You can also use the **Graph** button and select the graph type to visualize the data. - - + ## 4. Using SQL queries {#4-using-sql-queries} @@ -64,16 +62,15 @@ Draxlr offers an intuitive interface for connecting to your ClickHouse database, 2. Click the **Raw Query** button and enter your query in the text area. - + 3. Click on the **Execute Query** button to see the results. - ## 4. Saving you query {#4-saving-you-query} 1. After executing your query, click on the **Save Query** button. - + 2. You can name to query in **Query Name** text box and select a folder to categories it. @@ -81,12 +78,11 @@ Draxlr offers an intuitive interface for connecting to your ClickHouse database, 4. Click on the **Save** button to save the query. - ## 5. Building dashboards {#5-building-dashboards} 1. Click on the **Dashboards** button on the navbar. - + 2. You can add a new dashboard by clicking on the **Add +** button on the left sidebar. diff --git a/docs/integrations/data-visualization/embeddable-and-clickhouse.md b/docs/integrations/data-visualization/embeddable-and-clickhouse.md index ba8d5c44558..cb420e7e8d4 100644 --- a/docs/integrations/data-visualization/embeddable-and-clickhouse.md +++ b/docs/integrations/data-visualization/embeddable-and-clickhouse.md @@ -19,7 +19,6 @@ The end result is the ability to deliver fast, interactive customer-facing analy Built-in row-level security means that every user only ever sees exactly the data they're allowed to see. And two levels of fully-configurable caching mean you can deliver fast, real time analytics at scale. - ## 1. Gather your connection details {#1-gather-your-connection-details} @@ -48,7 +47,6 @@ fetch('https://api.embeddable.com/api/v1/connections', { }), }); - Response: Status 201 { errorMessage: null } ``` @@ -60,12 +58,12 @@ The `apiKey` can be found by clicking "**Publish**" on one of your Embeddable da The `name` is a unique name to identify this connection. - By default your data models will look for a connection called "default", but you can supply your models with different `data_source` names to support connecting different data models to different connections (simply specify the data_source name in the model) -The `type` tells Embeddable which driver to use + The `type` tells Embeddable which driver to use - Here you'll want to use `clickhouse`, but you can connect multiple different data sources to one Embeddable workspace so you may use others such as: `postgres`, `bigquery`, `mongodb`, etc. -The `credentials` is a JavaScript object containing the necessary credentials expected by the driver + The `credentials` is a JavaScript object containing the necessary credentials expected by the driver - These are securely encrypted and only used to retrieve exactly the data you have described in your data models. -Embeddable strongly encourage you to create a read-only database user for each connection (Embeddable will only ever read from your database, not write). + Embeddable strongly encourage you to create a read-only database user for each connection (Embeddable will only ever read from your database, not write). 
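As a rough sketch of what such a read-only user could look like on the ClickHouse side (the user name, password, and database below are placeholders — adapt them to your own security policies):

```sql
-- Hypothetical read-only user for a BI connection such as Embeddable.
CREATE USER embeddable_read IDENTIFIED BY 'change_me'
SETTINGS readonly = 2;                               -- read queries only; session settings may still be adjusted
GRANT SELECT ON my_database.* TO embeddable_read;    -- SELECT only, no write privileges
```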
-In order to support connecting to different databases for prod, qa, test, etc (or to support different databases for different customers) you can assign each connection to an environment (see [Environments API](https://docs.embeddable.com/data/environments)). + In order to support connecting to different databases for prod, qa, test, etc (or to support different databases for different customers) you can assign each connection to an environment (see [Environments API](https://docs.embeddable.com/data/environments)). diff --git a/docs/integrations/data-visualization/explo-and-clickhouse.md b/docs/integrations/data-visualization/explo-and-clickhouse.md index 9369532e9c1..9c47147f207 100644 --- a/docs/integrations/data-visualization/explo-and-clickhouse.md +++ b/docs/integrations/data-visualization/explo-and-clickhouse.md @@ -47,89 +47,88 @@ If you do not have a dataset to work with you can add one of the examples. This ## 1. Gather your connection details {#1-gather-your-connection-details} - ## 2. Connect Explo to ClickHouse {#2--connect-explo-to-clickhouse} 1. Sign up for an Explo account. 2. Click on the Explo **data** tab on the left hand sidebar. - + 3. Click **Connect Data Source** in the upper right hand side. - + 4. Fill out the information on the **Getting Started** page - + 5. Select **Clickhouse** - + 6. Enter your **Clickhouse Credentials**. - + 7. Configure **Security** - + 8. Within Clickhouse, **Whitelist the Explo IPs**. -` -54.211.43.19, 52.55.98.121, 3.214.169.94, and 54.156.141.148 -` + ` + 54.211.43.19, 52.55.98.121, 3.214.169.94, and 54.156.141.148 + ` ## 3. Create a Dashboard {#3-create-a-dashboard} 1. Navigate to **Dashboard** tab on the left side nav bar. - + 2. Click **Create Dashboard** in the upper right corner and name your dashboard. You've now created a dashboard! - + 3. You should now see a screen that is similar to this: - + ## 4. Run a SQL query {#4-run-a-sql-query} 1. Get your table name from the right hand sidebar under your schema title. You should then put the following command into your dataset editor: -` -SELECT * FROM YOUR_TABLE_NAME -LIMIT 100 -` + ` + SELECT * FROM YOUR_TABLE_NAME + LIMIT 100 + ` - + 2. Now click run and go to the preview tab to see your data. - + ## 5. Build a Chart {#5-build-a-chart} 1. From the left hand side, drag the bar chart icon onto the screen. - + 2. Select the dataset. You should now see a screen like the following: - + 3. Fill out the **county** in the X Axis and **Price** in the Y Axis Section like so: - + 4. Now, change the aggregation to **AVG**. - + 5. We now have average price of homes broken down by price! - + ## Learn more {#learn-more} diff --git a/docs/integrations/data-visualization/fabi-and-clickhouse.md b/docs/integrations/data-visualization/fabi-and-clickhouse.md index 2691475773d..98858c4eb39 100644 --- a/docs/integrations/data-visualization/fabi-and-clickhouse.md +++ b/docs/integrations/data-visualization/fabi-and-clickhouse.md @@ -18,7 +18,6 @@ import ConnectionDetails from '@site/docs/_snippets/_gather_your_details_http.md - Fabi.ai is an all-in-one collaborate data analysis platform. You can leverage SQL, Python, AI, and no-code to build dashboard and data workflows faster than ever before. Combined with the scale and power of ClickHouse, you can build and share your first highly performant dashboard on a massive dataset in minutes. @@ -32,12 +31,12 @@ import ConnectionDetails from '@site/docs/_snippets/_gather_your_details_http.md Log in or create your Fabi.ai account: https://app.fabi.ai/ 1. 
You’ll be prompted to connect your database when you first create your account, or if you already have an account, click on the data source panel on the left of any Smartbook and select Add Data Source. - - + + 2. You’ll then be prompted to enter your connection details. - + 3. Congratulations! You have now integrated ClickHouse into Fabi.ai. @@ -47,7 +46,6 @@ Once you’ve connected Fabi.ai to ClickHouse, go to any [Smartbook](https://doc - ## Additional Resources {#additional-resources} [Fabi.ai](https://www.fabi.ai) documentation: https://docs.fabi.ai/introduction diff --git a/docs/integrations/data-visualization/grafana/config.md b/docs/integrations/data-visualization/grafana/config.md index fd2186d62f2..70deec0c3f0 100644 --- a/docs/integrations/data-visualization/grafana/config.md +++ b/docs/integrations/data-visualization/grafana/config.md @@ -89,8 +89,8 @@ Example YAML for plain/secure headers: jsonData: httpHeaders: - name: X-Example-Plain-Header - value: plain text value - secure: false + value: plain text value + secure: false - name: X-Example-Secure-Header # "value" is excluded secure: true @@ -262,9 +262,9 @@ See [Grafana documentation](https://grafana.com/docs/grafana/latest/administrati ```yaml datasources: - name: Example ClickHouse - uid: clickhouse-example - type: grafana-clickhouse-datasource - jsonData: + uid: clickhouse-example + type: grafana-clickhouse-datasource + jsonData: host: 127.0.0.1 port: 9000 protocol: native @@ -307,9 +307,9 @@ datasources: startTimeColumn: tagsColumn: serviceTagsColumn: - secureJsonData: + secureJsonData: tlsCACert: tlsClientCert: tlsClientKey: secureHttpHeaders.X-Example-Secure-Header: secure header value -``` + ``` diff --git a/docs/integrations/data-visualization/grafana/index.md b/docs/integrations/data-visualization/grafana/index.md index e0112629062..cc6736ba6ff 100644 --- a/docs/integrations/data-visualization/grafana/index.md +++ b/docs/integrations/data-visualization/grafana/index.md @@ -71,7 +71,7 @@ Before Grafana can connect to ClickHouse, you need to install the appropriate Gr 2. Either scroll down and find the **ClickHouse** data source type, or you can search for it in the search bar of the **Add data source** page. Select the **ClickHouse** data source and the following page will appear: - + 3. Enter your server settings and credentials. The key settings are: @@ -81,7 +81,7 @@ Before Grafana can connect to ClickHouse, you need to install the appropriate Gr - **Secure connection** enable if your server requires a secure connection. - **Username** and **Password**: enter your ClickHouse user credentials. If you have not configured any users, try `default` for the username. It is recommended to [configure a read-only user](#2-making-a-read-only-user). -For more settings, check the [plugin configuration](./config.md) documentation. + For more settings, check the [plugin configuration](./config.md) documentation. 4. Click the **Save & test** button to verify that Grafana can connect to your ClickHouse service. 
If successful, you will see a **Data source is working** message: diff --git a/docs/integrations/data-visualization/grafana/query-builder.md b/docs/integrations/data-visualization/grafana/query-builder.md index de9b7667272..be4d321a13c 100644 --- a/docs/integrations/data-visualization/grafana/query-builder.md +++ b/docs/integrations/data-visualization/grafana/query-builder.md @@ -89,16 +89,15 @@ This query type will render the data in the logs panel along with a logs histogr Extra columns that are selected in the query can be viewed in the expanded log row: - ### Time series {#time-series} The time series query type is similar to [table](#table), but with a focus on time series data. The two views are mostly the same, with these notable differences: - - A dedicated *Time* field. - - In Aggregate mode, a time interval macro is automatically applied along with a Group By for the Time field. - - In Aggregate mode, the "Columns" field is hidden. - - A time range filter and Order By are automatically added for the **Time** field. +- A dedicated *Time* field. +- In Aggregate mode, a time interval macro is automatically applied along with a Group By for the Time field. +- In Aggregate mode, the "Columns" field is hidden. +- A time range filter and Order By are automatically added for the **Time** field. :::important Is your visualization missing data? In some cases the time series panel will appear to be cut off because the limit defaults to `1000`. @@ -210,7 +209,6 @@ Having defaults configured for both [logs](./config.md#logs) and [traces](./conf - ## Macros {#macros} Macros are a simple way to add dynamic SQL to your query. @@ -218,7 +216,6 @@ Before a query gets sent to the ClickHouse server, the plugin will expand the ma Queries from both the SQL Editor and Query Builder can use macros. - ### Using macros {#using-macros} Macros can be included anywhere in the query, multiple times if needed. diff --git a/docs/integrations/data-visualization/hashboard-and-clickhouse.md b/docs/integrations/data-visualization/hashboard-and-clickhouse.md index 07b8c116db5..512143c43d7 100644 --- a/docs/integrations/data-visualization/hashboard-and-clickhouse.md +++ b/docs/integrations/data-visualization/hashboard-and-clickhouse.md @@ -18,14 +18,12 @@ import CommunityMaintainedBadge from '@theme/badges/CommunityMaintained'; [Hashboard](https://hashboard.com) is an interactive data exploration tool that enables anyone in your organization to track metrics and discover actionable insights. Hashboard issues live SQL queries to your ClickHouse database and is particularly useful for self-serve, ad hoc data exploration use cases. -
This guide will walk you through the steps to connect Hashboard with your ClickHouse instance. This information is also available on Hashboard's [ClickHouse integration documentation](https://docs.hashboard.com/docs/database-connections/clickhouse). - ## Pre-requisites {#pre-requisites} - A ClickHouse database either hosted on your own infrastructure or on [ClickHouse Cloud](https://clickhouse.com/). @@ -47,7 +45,7 @@ This guide will walk you through the steps to connect Hashboard with your ClickH 6. Click "Test" to validate that the connection is configured successfully. 7. Click "Add" -Your ClickHouse database is now be connected to Hashboard and you can proceed by building [Data Models](https://docs.hashboard.com/docs/data-modeling/add-data-model), [Explorations](https://docs.hashboard.com/docs/visualizing-data/explorations), [Metrics](https://docs.hashboard.com/docs/metrics), and [Dashboards](https://docs.hashboard.com/docs/dashboards). See the corresponding Hashboard documentation for more detail on these features. + Your ClickHouse database is now be connected to Hashboard and you can proceed by building [Data Models](https://docs.hashboard.com/docs/data-modeling/add-data-model), [Explorations](https://docs.hashboard.com/docs/visualizing-data/explorations), [Metrics](https://docs.hashboard.com/docs/metrics), and [Dashboards](https://docs.hashboard.com/docs/dashboards). See the corresponding Hashboard documentation for more detail on these features. ## Learn more {#learn-more} diff --git a/docs/integrations/data-visualization/index.md b/docs/integrations/data-visualization/index.md index 0e32a3ed2bd..b95fbc1414a 100644 --- a/docs/integrations/data-visualization/index.md +++ b/docs/integrations/data-visualization/index.md @@ -60,12 +60,12 @@ Now that your data is in ClickHouse, it's time to analyze it, which often involv | Looker | MySQL interface | 🚧 | ❌ | | | [Luzmo](./luzmo-and-clickhouse.md) | ClickHouse official connector | ✅ | ✅ | | | [Looker Studio](./looker-studio-and-clickhouse.md) | MySQL interface | ✅ | ✅ | | -| [Metabase](./metabase-and-clickhouse.md) | ClickHouse official connector | ✅ | ✅ | +| [Metabase](./metabase-and-clickhouse.md) | ClickHouse official connector | ✅ | ✅ | | [Mitzu](./mitzu-and-clickhouse.md) | Native connector | ✅ | ✅ | | | [Omni](./omni-and-clickhouse.md) | Native connector | ✅ | ✅ | | | [Power BI Desktop](./powerbi-and-clickhouse.md) | ClickHouse official connector | ✅ | ✅ | Via ODBC, supports direct query mode | | [Power BI service](/integrations/powerbi#power-bi-service) | ClickHouse official connector | ✅ | ✅ | A [Microsoft Data Gateway](https://learn.microsoft.com/en-us/power-bi/connect-data/service-gateway-custom-connectors) setup is required | -| [Rill](https://docs.rilldata.com/reference/olap-engines/clickhouse) | Native connector | ✅ | ✅ | +| [Rill](https://docs.rilldata.com/reference/olap-engines/clickhouse) | Native connector | ✅ | ✅ | | [Rocket BI](./rocketbi-and-clickhouse.md) | Native connector | ✅ | ❌ | | | [Tableau Desktop](./tableau/tableau-and-clickhouse.md) | ClickHouse official connector | ✅ | ✅ | | | [Tableau Online](./tableau/tableau-online-and-clickhouse.md) | MySQL interface | ✅ | ✅ | Works with some limitations, see [the documentation](./tableau/tableau-online-and-clickhouse.md) for more details | diff --git a/docs/integrations/data-visualization/looker-and-clickhouse.md b/docs/integrations/data-visualization/looker-and-clickhouse.md index acee94b3457..f525c10cce5 100644 --- 
a/docs/integrations/data-visualization/looker-and-clickhouse.md +++ b/docs/integrations/data-visualization/looker-and-clickhouse.md @@ -50,17 +50,17 @@ Now you should be able to attach ClickHouse Datasource to your Looker project. ## 3. Known limitations {#3-known-limitations} 1. The following data types are handled as strings by default: - * Array - serialization does not work as expected due to the JDBC driver limitations - * Decimal* - can be changed to number in the model - * LowCardinality(...) - can be changed to a proper type in the model - * Enum8, Enum16 - * UUID - * Tuple - * Map - * JSON - * Nested - * FixedString - * Geo types + * Array - serialization does not work as expected due to the JDBC driver limitations + * Decimal* - can be changed to number in the model + * LowCardinality(...) - can be changed to a proper type in the model + * Enum8, Enum16 + * UUID + * Tuple + * Map + * JSON + * Nested + * FixedString + * Geo types * MultiPolygon * Polygon * Point diff --git a/docs/integrations/data-visualization/luzmo-and-clickhouse.md b/docs/integrations/data-visualization/luzmo-and-clickhouse.md index 2e3fff028a5..82a6d71d482 100644 --- a/docs/integrations/data-visualization/luzmo-and-clickhouse.md +++ b/docs/integrations/data-visualization/luzmo-and-clickhouse.md @@ -28,12 +28,12 @@ You'll be asked to provide a **host**, **username** and **password**: -* **Host**: this is the host where your ClickHouse database is exposed. Note that only `https` is allowed here in order to securely transfer data over the wire. The structure of the host url expects: `https://url-to-clickhouse-db:port/database` +* **Host**: this is the host where your ClickHouse database is exposed. Note that only `https` is allowed here in order to securely transfer data over the wire. The structure of the host url expects: `https://url-to-clickhouse-db:port/database` By default, the plugin will connect to the 'default' database and the 443 port. By providing a database after the '/' you can configure which database to connect to. -* **Username**: the username that will be used to connect to your ClickHouse cluster. -* **Password**: the password to connect to your ClickHouse cluster +* **Username**: the username that will be used to connect to your ClickHouse cluster. +* **Password**: the password to connect to your ClickHouse cluster -Please refer to the examples in our developer documentation to find out how to [create a connection to ClickHouse](https://developer.luzmo.com/api/createAccount?exampleSection=AccountCreateClickhouseRequestBody) via our API. + Please refer to the examples in our developer documentation to find out how to [create a connection to ClickHouse](https://developer.luzmo.com/api/createAccount?exampleSection=AccountCreateClickhouseRequestBody) via our API. ## 2. Add datasets {#2-add-datasets} diff --git a/docs/integrations/data-visualization/metabase-and-clickhouse.md b/docs/integrations/data-visualization/metabase-and-clickhouse.md index e83f76863bc..d621c11cb9d 100644 --- a/docs/integrations/data-visualization/metabase-and-clickhouse.md +++ b/docs/integrations/data-visualization/metabase-and-clickhouse.md @@ -51,7 +51,6 @@ If you do not have a dataset to work with you can add one of the examples. This 5. Access Metabase at http://hostname:3000. On the initial startup, you will see a welcome screen and have to work your way through a list of questions. If prompted to select a database, select "**I'll add my data later**": - ## 3. 
Connect Metabase to ClickHouse {#3--connect-metabase-to-clickhouse} 1. Click on the gear icon in the top-right corner and select **Admin Settings** to visit your Metabase admin page. diff --git a/docs/integrations/data-visualization/mitzu-and-clickhouse.md b/docs/integrations/data-visualization/mitzu-and-clickhouse.md index 32bf477f2c1..756dca4be34 100644 --- a/docs/integrations/data-visualization/mitzu-and-clickhouse.md +++ b/docs/integrations/data-visualization/mitzu-and-clickhouse.md @@ -36,12 +36,12 @@ In this guide, we are going to cover the following: - Warehouse-native product analytics - How to integrate Mitzu to ClickHouse -:::tip Example datasets -If you do not have a data set to use for Mitzu, you can work with NYC Taxi Data. -This dataset is available in ClickHouse Cloud or [can be loaded with these instructions](/getting-started/example-datasets/nyc-taxi). -::: + :::tip Example datasets + If you do not have a data set to use for Mitzu, you can work with NYC Taxi Data. + This dataset is available in ClickHouse Cloud or [can be loaded with these instructions](/getting-started/example-datasets/nyc-taxi). + ::: -This guide is just a brief overview of how to use Mitzu. You can find more detailed information in the [Mitzu documentation](https://docs.mitzu.io/). + This guide is just a brief overview of how to use Mitzu. You can find more detailed information in the [Mitzu documentation](https://docs.mitzu.io/). ## 1. Gather your connection details {#1-gather-your-connection-details} @@ -127,7 +127,6 @@ Get immediate conversion rate insights without writing a single line of SQL code Pick `Weekly cohort retention` to visualize how your retention rates change over time. ::: - ## 7. Run journey queries {#7-run-journey-queries} Select up to 9 steps for a funnel. Choose the time window within which your users can finish the journey. The Mitzu journey chart gives you a visual map of every path users take through the selected events. diff --git a/docs/integrations/data-visualization/powerbi-and-clickhouse.md b/docs/integrations/data-visualization/powerbi-and-clickhouse.md index b2ad10b2025..4c093e9d190 100644 --- a/docs/integrations/data-visualization/powerbi-and-clickhouse.md +++ b/docs/integrations/data-visualization/powerbi-and-clickhouse.md @@ -36,9 +36,9 @@ There are several flavours of Power BI that you can use to visualise your data: * Power BI Desktop: A Windows desktop application for creating Dashboards and Visualisations * Power BI Service: Available within Azure as a SaaS to host the Dashboards created on Power BI Desktop -Power BI requires you to create your dashboards within the Desktop version and publish them to Power BI Service. + Power BI requires you to create your dashboards within the Desktop version and publish them to Power BI Service. -This tutorial will guide you through the process of: + This tutorial will guide you through the process of: * [Installing the ClickHouse ODBC Driver](#install-the-odbc-driver) * [Installing the ClickHouse Power BI Connector into Power BI Desktop](#power-bi-installation) @@ -77,7 +77,6 @@ Download the most recent [ClickHouse ODBC release](https://github.com/ClickHouse Execute the supplied `.msi` installer and follow the wizard. -
@@ -122,22 +121,20 @@ Select the connector, and enter in the ClickHouse instance credentials: * Port (required) - Your instance port. * Database - Your database name. * Options - Any ODBC option as listed - in [ClickHouse ODBC GitHub Page](https://github.com/ClickHouse/clickhouse-odbc#configuration) + in [ClickHouse ODBC GitHub Page](https://github.com/ClickHouse/clickhouse-odbc#configuration) * Data Connectivity mode - DirectQuery - -
+ -:::note -We advise selecting DirectQuery for querying ClickHouse directly. + :::note + We advise selecting DirectQuery for querying ClickHouse directly. -If you have a use case that has a small amount of data, you can choose import mode, and the entire data will be loaded to Power BI. -::: + If you have a use case that has a small amount of data, you can choose import mode, and the entire data will be loaded to Power BI. + ::: * Specify username and password - -
+ ### Query and Visualise Data {#query-and-visualise-data} @@ -181,7 +178,6 @@ Choose the Unicode version of the ODBC driver. Fill in the connection details. -
@@ -190,7 +186,7 @@ If you are using a deployment that has SSL enabled (e.g. ClickHouse Cloud or a s - `Host` should always have the protocol (i.e. `http://` or `https://`) omitted. - `Timeout` is an integer representing seconds. Default value: `30 seconds`. -::: + ::: ### Get data into Power BI {#get-data-into-power-bi} @@ -226,7 +222,6 @@ Finally, you should see the databases and tables in the Navigator view. Select t Once the import is complete, your ClickHouse Data should be accessible in Power BI as usual. - ## Known limitations {#known-limitations} ### UInt64 {#uint64} diff --git a/docs/integrations/data-visualization/rocketbi-and-clickhouse.md b/docs/integrations/data-visualization/rocketbi-and-clickhouse.md index becbd13b74a..cacdc3acc3b 100644 --- a/docs/integrations/data-visualization/rocketbi-and-clickhouse.md +++ b/docs/integrations/data-visualization/rocketbi-and-clickhouse.md @@ -82,7 +82,6 @@ Rename filters & Save Control to Dashboard - #### Create a date type control {#create-a-date-type-control} Choose a Date field as Main Date column: diff --git a/docs/integrations/data-visualization/splunk-and-clickhouse.md b/docs/integrations/data-visualization/splunk-and-clickhouse.md index 1c3518d821e..58032d9fe77 100644 --- a/docs/integrations/data-visualization/splunk-and-clickhouse.md +++ b/docs/integrations/data-visualization/splunk-and-clickhouse.md @@ -30,7 +30,6 @@ For ClickHouse specifically, we are leveraging the [Splunk DB Connect App](https The ideal use case for this integration is when you are using ClickHouse for large data sources such as NetFlow, Avro or Protobuf binary data, DNS, VPC flow logs, and other OTEL logs that can be shared with your team on Splunk to search and create dashboards. By using this approach, the data is not ingested into the Splunk index layer and is simply queried directly from ClickHouse similarly to other visualization integrations such as [Metabase](https://www.metabase.com/) or [Superset](https://superset.apache.org/). - ## Goal​ {#goal} In this guide, we will use the ClickHouse JDBC driver to connect ClickHouse to Splunk. We will install a local version of Splunk Enterprise but we are not indexing any data. Instead, we are using the search functions through the DB Connect query engine. @@ -64,11 +63,11 @@ Ensure that the DB Connect App is installed on Splunk Enterprise. You can find i - Click the green "Install" button next to Splunk DB Connect - Click "Restart Splunk" -If you're having issues installing the DB Connect App, please see [this link](https://splunkbase.splunk.com/app/2686) for additional instructions. + If you're having issues installing the DB Connect App, please see [this link](https://splunkbase.splunk.com/app/2686) for additional instructions. -Once you've verified that the DB Connect App is installed, add the java_home path to the DB Connect App in Configuration -> Settings, and click save then reset. + Once you've verified that the DB Connect App is installed, add the java_home path to the DB Connect App in Configuration -> Settings, and click save then reset. - + ## Configure JDBC for ClickHouse {#configure-jdbc-for-clickhouse} @@ -154,7 +153,6 @@ We will now create a dashboard by clicking Save As > Save to a Dashboard. Let's add another query that shows the average fare based on the number of passengers. 
- ```sql dbxquery query="SELECT passenger_count,avg(total_amount) FROM default.trips GROUP BY passenger_count;" connection="chc" @@ -166,7 +164,6 @@ This time, let's create a bar chart visualization and save it to the previous da Finally, let's add one more query that shows the correlation between the number of passengers and the distance of the trip: - ```sql dbxquery query="SELECT passenger_count, toYear(pickup_datetime) AS year, round(trip_distance) AS distance, count(* FROM default.trips) diff --git a/docs/integrations/data-visualization/superset-and-clickhouse.md b/docs/integrations/data-visualization/superset-and-clickhouse.md index 9ba0945649e..5f593c480cf 100644 --- a/docs/integrations/data-visualization/superset-and-clickhouse.md +++ b/docs/integrations/data-visualization/superset-and-clickhouse.md @@ -57,21 +57,18 @@ If you do not have a dataset to work with you can add one of the examples. This 1. Within Superset, select **Data** from the top menu and then **Databases** from the drop-down menu. Add a new database by clicking the **+ Database** button: - -
+ 2. In the first step, select **ClickHouse Connect** as the type of database: - -
+ 3. In the second step: - - Set SSL on or off. - - Enter the connection information that you collected earlier - - Specify the **DISPLAY NAME**: this can be any name you prefer. If you will be connecting to multiple ClickHouse databases then make the name more descriptive. + - Set SSL on or off. + - Enter the connection information that you collected earlier + - Specify the **DISPLAY NAME**: this can be any name you prefer. If you will be connecting to multiple ClickHouse databases then make the name more descriptive. - -
+ 4. Click the **CONNECT** and then **FINISH** buttons to complete the setup wizard, and you should see your database in the list of databases. @@ -81,43 +78,35 @@ If you do not have a dataset to work with you can add one of the examples. This 2. Click the button for adding a dataset. Select your new database as the datasource and you should see the tables defined in your database: - -
+ 3. Click the **ADD** button at the bottom of the dialog window and your table appears in the list of datasets. You are ready to build a dashboard and analyze your ClickHouse data! - ## 5. Creating charts and a dashboard in Superset {#5--creating-charts-and-a-dashboard-in-superset} If you are familiar with Superset, then you will feel right at home with this next section. If you are new to Superset, well...it's like a lot of the other cool visualization tools out there in the world - it doesn't take long to get started, but the details and nuances get learned over time as you use the tool. 1. You start with a dashboard. From the top menu in Superset, select **Dashboards**. Click the button in the upper-right to add a new dashboard. The following dashboard is named **UK property prices**: - -
+ 2. To create a new chart, select **Charts** from the top menu and click the button to add a new chart. You will be shown a lot of options. The following example shows a **Pie Chart** chart using the **uk_price_paid** dataset from the **CHOOSE A DATASET** drop-down: - -
+ 3. Superset pie charts need a **Dimension** and a **Metric**, the rest of the settings are optional. You can pick your own fields for the dimension and metric, this example uses the ClickHouse field `district` as the dimension and `AVG(price)` as the metric. - - -
+ + 5. If you prefer doughnut charts over pie, then you can set that and other options under **CUSTOMIZE**: - -
+ 6. Click the **SAVE** button to save the chart, then select **UK property prices** under the **ADD TO DASHBOARD** drop-down, then **SAVE & GO TO DASHBOARD** saves the chart and adds it to the dashboard: - -
+ 7. That's it. Building dashboards in Superset based on data in ClickHouse opens up a whole world of blazing fast data analytics! - -
+ diff --git a/docs/integrations/data-visualization/tableau/tableau-and-clickhouse.md b/docs/integrations/data-visualization/tableau/tableau-and-clickhouse.md index 127cb6b448c..25d4d80fd9c 100644 --- a/docs/integrations/data-visualization/tableau/tableau-and-clickhouse.md +++ b/docs/integrations/data-visualization/tableau/tableau-and-clickhouse.md @@ -32,23 +32,21 @@ The connector is based on ClickHouse's advanced [JDBC driver](/integrations/lang With this connector, Tableau integrates ClickHouse databases and tables as data sources. To enable this functionality, follow the setup guide bellow. - ## Setup required prior usage {#setup-required-prior-usage} - 1. Gather your connection details - + 2. Download and install Tableau - desktop. + desktop. 3. Follow `clickhouse-tableau-connector-jdbc` instructions to download the compatible version - of ClickHouse JDBC driver. + of ClickHouse JDBC driver. -:::note -Make sure you download the **clickhouse-jdbc-x.x.x-shaded-all.jar** JAR file. Currently, we recommended using versions `0.8.X`. -::: + :::note + Make sure you download the **clickhouse-jdbc-x.x.x-shaded-all.jar** JAR file. Currently, we recommended using versions `0.8.X`. + ::: 4. Store the JDBC driver in the following folder (based on your OS, if the folder doesn't exist you can create it): - macOS: `~/Library/Tableau/Drivers` @@ -64,24 +62,21 @@ source in Tableau that connects to the **TPCD** database in ClickHouse. 2. From the left-side menu, click on **More** under the **To a Server** section. Search for **ClickHouse by ClickHouse** in the available connectors list: - -
+ -:::note -Don't see the **ClickHouse by ClickHouse** connector in your connectors list? It might be related to an old Tableau Desktop version. -To solve that, consider upgrading your Tableau Desktop application, or [install the connector manually](#install-the-connector-manually). -::: + :::note + Don't see the **ClickHouse by ClickHouse** connector in your connectors list? It might be related to an old Tableau Desktop version. + To solve that, consider upgrading your Tableau Desktop application, or [install the connector manually](#install-the-connector-manually). + ::: 3. Click on **ClickHouse by ClickHouse** and the following dialog will pop up: - -
- + + 4. Click **Install and Restart Tableau**. Restart the application. 5. After restarting, the connector will have its full name: `ClickHouse JDBC by ClickHouse, Inc.`. When clicking it the following dialog will pop up: - -
+ 6. Enter your connection details: @@ -93,29 +88,25 @@ To solve that, consider upgrading your Tableau Desktop application, or [install | Username | **default** | | Password | *\***** | -:::note -When working with ClickHouse cloud, it's required to enable the SSL checkbox for secured connections. -::: -
+ :::note + When working with ClickHouse cloud, it's required to enable the SSL checkbox for secured connections. + ::: - -:::note -Our ClickHouse database is named **TPCD**, but you must set the **Database** to **default** in the dialog above, then -select **TPCD** for the **Schema** in the next step. (This is likely due to a bug in the connector, so this behavior -could change, but for now you must use **default** as the database.) -::: + :::note + Our ClickHouse database is named **TPCD**, but you must set the **Database** to **default** in the dialog above, then + select **TPCD** for the **Schema** in the next step. (This is likely due to a bug in the connector, so this behavior + could change, but for now you must use **default** as the database.) + ::: 7. Click the **Sign In** button and you should see a new Tableau workbook: - -
+ 8. Select **TPCD** from the **Schema** dropdown and you should see the list of tables in **TPCD**: - -
+ -You are now ready to build some visualizations in Tableau! + You are now ready to build some visualizations in Tableau! ## Building Visualizations in Tableau {#building-visualizations-in-tableau} @@ -123,59 +114,51 @@ Now that have a ClickHouse data source configured in Tableau, let's visualize th 1. Drag the **CUSTOMER** table onto the workbook. Notice the columns appear, but the data table is empty: - -
+ 2. Click the **Update Now** button and 100 rows from **CUSTOMER** will populate the table. - 3. Drag the **ORDERS** table into the workbook, then set **Custkey** as the relationship field between the two tables: - -
+ 4. You now have the **ORDERS** and **LINEITEM** tables associated with each other as your data source, so you can use - this relationship to answer questions about the data. Select the **Sheet 1** tab at the bottom of the workbook. + this relationship to answer questions about the data. Select the **Sheet 1** tab at the bottom of the workbook. - -
+ 5. Suppose you want to know how many specific items were ordered each year. Drag **OrderDate** from **ORDERS** into the - **Columns** section (the horizontal field), then drag **Quantity** from **LINEITEM** into the **Rows**. Tableau will - generate the following line chart: + **Columns** section (the horizontal field), then drag **Quantity** from **LINEITEM** into the **Rows**. Tableau will + generate the following line chart: - -
+ -Not a very exciting line chart, but the dataset was generated by a script and built for testing query performance, so -you will notice there is not a lot of variations in the simulated orders of the TCPD data. + Not a very exciting line chart, but the dataset was generated by a script and built for testing query performance, so + you will notice there is not a lot of variations in the simulated orders of the TCPD data. 6. Suppose you want to know the average order amount (in dollars) by quarter and also by shipping mode (air, mail, ship, - truck, etc.): + truck, etc.): - Click the **New Worksheet** tab create a new sheet - Drag **OrderDate** from **ORDERS** into **Columns** and change it from **Year** to **Quarter** - Drag **Shipmode** from **LINEITEM** into **Rows** -You should see the following: + You should see the following: - -
+ 7. The **Abc** values are just filling in the space until you drag a metric onto the table. Drag **Totalprice** from * - *ORDERS** onto the table. Notice the default calculation is to **SUM** the **Totalprices**: + *ORDERS** onto the table. Notice the default calculation is to **SUM** the **Totalprices**: - -
+ 8. Click on **SUM** and change the **Measure** to **Average**. From the same dropdown menu, select **Format** change the - **Numbers** to **Currency (Standard)**: + **Numbers** to **Currency (Standard)**: - -
+ -Well done! You have successfully connected Tableau to ClickHouse, and you have opened up a whole world of possibilities -for analyzing and visualizing your ClickHouse data. + Well done! You have successfully connected Tableau to ClickHouse, and you have opened up a whole world of possibilities + for analyzing and visualizing your ClickHouse data. ## Install the connector manually {#install-the-connector-manually} @@ -183,13 +166,13 @@ In case you use an outdated Tableau Desktop version that doesn't include the con 1. Download the latest taco file from [Tableau Exchange](https://exchange.tableau.com/products/1064) 2. Place the taco file in - * macOS: `~/Documents/My Tableau Repository/Connectors` - * Windows: `C:\Users\[Windows User]\Documents\My Tableau Repository\Connectors` + * macOS: `~/Documents/My Tableau Repository/Connectors` + * Windows: `C:\Users\[Windows User]\Documents\My Tableau Repository\Connectors` 3. Restart Tableau Desktop, if your setup went successfully, you will set the connector under the `New Data Source` section. ## Connection and analysis tips {#connection-and-analysis-tips} -For more guidance on optimizing your Tableau-ClickHouse integration, +For more guidance on optimizing your Tableau-ClickHouse integration, please visit [Connection Tips](/integrations/tableau/connection-tips) and [Analysis Tips](/integrations/tableau/analysis-tips). ## Tests {#tests} diff --git a/docs/integrations/data-visualization/tableau/tableau-connection-tips.md b/docs/integrations/data-visualization/tableau/tableau-connection-tips.md index 3392c21dc1c..435aaaeaac7 100644 --- a/docs/integrations/data-visualization/tableau/tableau-connection-tips.md +++ b/docs/integrations/data-visualization/tableau/tableau-connection-tips.md @@ -14,7 +14,7 @@ import Image from '@theme/IdealImage'; If the *Set Session ID* checkbox is activated on the Advanced tab (by default), feel free to set session level [settings](/operations/settings/settings/) using ```text SET my_setting=value; -``` +``` ## Advanced tab {#advanced-tab} In 99% of cases you don't need the Advanced tab, for the remaining 1% you can use the following settings: @@ -24,7 +24,7 @@ In 99% of cases you don't need the Advanced tab, for the remaining 1% you can us ```text UInt256=java.lang.Double,Int256=java.lang.Double ``` - Read more about mapping in the corresponding section + Read more about mapping in the corresponding section - **JDBC Driver URL Parameters**. You can pass the remaining [driver parameters](https://github.com/ClickHouse/clickhouse-jdbc#configuration), for example `jdbcCompliance`, in this field. Be careful, the parameter values must be passed in the URL Encoded format, and in the case of passing `custom_http_params` or `typeMappings` in this field and in the previous fields of the Advanced tab, the values of the preceding two fields on the Advanced tab have a higher priority - **Set Session ID** checkbox. 
It is needed to set session-level settings in Initial SQL tab, generates a `session_id` with a timestamp and a pseudo-random number in the format `"tableau-jdbc-connector-*{timestamp}*-*{number}*"` diff --git a/docs/integrations/data-visualization/tableau/tableau-online-and-clickhouse.md b/docs/integrations/data-visualization/tableau/tableau-online-and-clickhouse.md index 773c05dab1e..96f3ab3392f 100644 --- a/docs/integrations/data-visualization/tableau/tableau-online-and-clickhouse.md +++ b/docs/integrations/data-visualization/tableau/tableau-online-and-clickhouse.md @@ -58,20 +58,20 @@ NB: if you want to use Tableau Online in combination with Tableau Desktop and sh ## Connecting Tableau Online to ClickHouse (cloud or on-premise setup with SSL) {#connecting-tableau-online-to-clickhouse-cloud-or-on-premise-setup-with-ssl} -As it is not possible to provide the SSL certificates via the Tableau Online MySQL connection setup wizard, +As it is not possible to provide the SSL certificates via the Tableau Online MySQL connection setup wizard, the only way is to use Tableau Desktop to set the connection up, and then export it to Tableau Online. This process is, however, pretty straightforward. Run Tableau Desktop on a Windows or Mac machine, and select "Connect" -> "To a Server" -> "MySQL". -Likely, it will be required to install the MySQL driver on your machine first. -You can do that by following the setup guide that is displayed [here](https://www.tableau.com/support/drivers) if you select MySQL from the Data Source drop-down. +Likely, it will be required to install the MySQL driver on your machine first. +You can do that by following the setup guide that is displayed [here](https://www.tableau.com/support/drivers) if you select MySQL from the Data Source drop-down. If you have an M1 Mac, check [this troubleshooting thread](https://community.tableau.com/s/question/0D58b0000Ar6OhvCQE/unable-to-install-mysql-driver-for-m1-mac) for a driver installation workaround.
:::note -In the MySQL connection setup UI, make sure that the "SSL" option is enabled. -ClickHouse Cloud's SSL certificate is signed by [Let's Encrypt](https://letsencrypt.org/certificates/). +In the MySQL connection setup UI, make sure that the "SSL" option is enabled. +ClickHouse Cloud's SSL certificate is signed by [Let's Encrypt](https://letsencrypt.org/certificates/). You can download this root cert [here](https://letsencrypt.org/certs/isrgrootx1.pem). ::: @@ -80,7 +80,7 @@ Provide your ClickHouse Cloud instance MySQL user credentials and the path to th
-Choose the desired tables as usual (similarly to Tableau Online), +Choose the desired tables as usual (similarly to Tableau Online), and select "Server" -> "Publish Data Source" -> Tableau Cloud. @@ -98,7 +98,6 @@ Additionally, choose "Update workbook to use the published data source". Finally, click "Publish", and your datasource with embedded credentials will be opened automatically in Tableau Online. - ## Known limitations (ClickHouse 23.11) {#known-limitations-clickhouse-2311} All the known limitations has been fixed in ClickHouse `23.11`. If you encounter any other incompatibilities, please do not hesitate to [contact us](https://clickhouse.com/company/contact) or create a [new issue](https://github.com/ClickHouse/ClickHouse/issues). diff --git a/docs/integrations/data-visualization/zingdata-and-clickhouse.md b/docs/integrations/data-visualization/zingdata-and-clickhouse.md index 431509786c1..4fb7cded00c 100644 --- a/docs/integrations/data-visualization/zingdata-and-clickhouse.md +++ b/docs/integrations/data-visualization/zingdata-and-clickhouse.md @@ -29,7 +29,7 @@ import CommunityMaintainedBadge from '@theme/badges/CommunityMaintained'; ## How to connect {#how-to-connect} 1. Gather your connection details. - + 2. Download or visit Zing Data @@ -44,17 +44,14 @@ import CommunityMaintainedBadge from '@theme/badges/CommunityMaintained'; * To add a datasource on web, click on **Data Sources** on the top menu, click on **New Datasource** and select **Clickhouse** from the dropdown menu -
4. Fill out the connection details and click on **Check Connection**. -
5. If the connection is successful, Zing will take you to table selection. Select the required tables and click on **Save**. If Zing cannot connect to your data source, you'll see a message asking you to check your credentials and retry. If, even after checking your credentials and retrying, you still experience issues, reach out to Zing support here.

-
6. Once the ClickHouse datasource is added, it will be available to everyone in your Zing organization under the **Data Sources** / **Sources** tab.

@@ -65,12 +62,10 @@ import CommunityMaintainedBadge from '@theme/badges/CommunityMaintained';

2. Click on a table in the list of tables to create a chart.

-
3. Use the visual query builder to pick the desired fields, aggregations, etc., and click on **Run Question**. -
4. If you are familiar with SQL, you can also write custom SQL to run queries and create a chart.

@@ -80,12 +75,10 @@ import CommunityMaintainedBadge from '@theme/badges/CommunityMaintained';

5. An example chart would look as follows. The question can be saved using the three-dot menu. You can comment on the chart, tag your team members, create real-time alerts, change the chart type, etc.

-
6. Dashboards can be created using the "+" icon under **Dashboards** on the Home screen. Existing questions can be dragged in to be displayed on the dashboard.

-
## Related content {#related-content} diff --git a/docs/integrations/index.mdx b/docs/integrations/index.mdx index 21d030bc389..a7ac160ded7 100644 --- a/docs/integrations/index.mdx +++ b/docs/integrations/index.mdx @@ -189,12 +189,12 @@ ClickHouse integrations are organized by their support level: - **Partner integrations:** built or maintained, and supported by, third-party software vendors - **Community integrations:** built or maintained and supported by community members. No direct support is available besides the public GitHub repositories and community Slack channels -Each integration is further categorized into **Language client**, **Data ingestion**, **Data visualization** and **SQL client** categories. + Each integration is further categorized into **Language client**, **Data ingestion**, **Data visualization** and **SQL client** categories. -:::note -We are actively compiling this list of ClickHouse integrations below, so it's not exhaustive. Feel free to -[contribute](https://github.com/ClickHouse/clickhouse-docs#contributing) any relevant ClickHouse integration to the list. -::: + :::note + We are actively compiling this list of ClickHouse integrations below, so it's not exhaustive. Feel free to + [contribute](https://github.com/ClickHouse/clickhouse-docs#contributing) any relevant ClickHouse integration to the list. + ::: ## Core integrations diff --git a/docs/integrations/language-clients/go/index.md b/docs/integrations/language-clients/go/index.md index 70d096ce9ad..0ae8d893398 100644 --- a/docs/integrations/language-clients/go/index.md +++ b/docs/integrations/language-clients/go/index.md @@ -153,16 +153,16 @@ ClickHouse supports two official Go clients. These clients are complementary and * [clickhouse-go](https://github.com/ClickHouse/clickhouse-go) - High level language client which supports either the Go standard database/sql interface or the native interface. * [ch-go](https://github.com/ClickHouse/ch-go) - Low level client. Native interface only. -clickhouse-go provides a high-level interface, allowing users to query and insert data using row-orientated semantics and batching that are lenient with respect to data types - values will be converted provided no precision loss is potentially incurred. ch-go, meanwhile, provides an optimized column-orientated interface that provides fast data block streaming with low CPU and memory overhead at the expense of type strictness and more complex usage. + clickhouse-go provides a high-level interface, allowing users to query and insert data using row-orientated semantics and batching that are lenient with respect to data types - values will be converted provided no precision loss is potentially incurred. ch-go, meanwhile, provides an optimized column-orientated interface that provides fast data block streaming with low CPU and memory overhead at the expense of type strictness and more complex usage. -From version 2.3, Clickhouse-go utilizes ch-go for low-level functions such as encoding, decoding, and compression. Note that clickhouse-go also supports the Go `database/sql` interface standard. Both clients use the native format for their encoding to provide optimal performance and can communicate over the native ClickHouse protocol. clickhouse-go also supports HTTP as its transport mechanism for cases where users have a requirement to proxy or load balance traffic. + From version 2.3, Clickhouse-go utilizes ch-go for low-level functions such as encoding, decoding, and compression. 
Note that clickhouse-go also supports the Go `database/sql` interface standard. Both clients use the native format for their encoding to provide optimal performance and can communicate over the native ClickHouse protocol. clickhouse-go also supports HTTP as its transport mechanism for cases where users have a requirement to proxy or load balance traffic. -When choosing a client library, users should be aware of their respective pros and cons - see Choosing a Client Library. + When choosing a client library, users should be aware of their respective pros and cons - see Choosing a Client Library. -| | Native format | Native protocol | HTTP protocol | Row Orientated API | Column Orientated API | Type flexibility | Compression | Query Placeholders | -|:-------------:|:-------------:|:---------------:|:-------------:|:------------------:|:---------------------:|:----------------:|:-----------:|:------------------:| -| clickhouse-go | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| ch-go | ✅ | ✅ | | | ✅ | | ✅ | | + | | Native format | Native protocol | HTTP protocol | Row Orientated API | Column Orientated API | Type flexibility | Compression | Query Placeholders | + |:-------------:|:-------------:|:---------------:|:-------------:|:------------------:|:---------------------:|:----------------:|:-----------:|:------------------:| + | clickhouse-go | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | + | ch-go | ✅ | ✅ | | | ✅ | | ✅ | | ## Choosing a client {#choosing-a-client} @@ -177,14 +177,14 @@ The clickhouse-go client provides two API interfaces for communicating with Clic * ClickHouse client-specific API * `database/sql` standard - generic interface around SQL databases provided by Golang. -While the `database/sql` provides a database-agnostic interface, allowing developers to abstract their data store, it enforces some typing and query semantics that impact performance. For this reason, the client-specific API should be used where [performance is important](https://github.com/clickHouse/clickHouse-go#benchmark). However, users who wish to integrate ClickHouse into tooling, which supports multiple databases, may prefer to use the standard interface. + While the `database/sql` provides a database-agnostic interface, allowing developers to abstract their data store, it enforces some typing and query semantics that impact performance. For this reason, the client-specific API should be used where [performance is important](https://github.com/clickHouse/clickHouse-go#benchmark). However, users who wish to integrate ClickHouse into tooling, which supports multiple databases, may prefer to use the standard interface. -Both interfaces encode data using the [native format](/native-protocol/basics.md) and native protocol for communication. Additionally, the standard interface supports communication over HTTP. + Both interfaces encode data using the [native format](/native-protocol/basics.md) and native protocol for communication. Additionally, the standard interface supports communication over HTTP. 
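To make the trade-off concrete, the sketch below opens the same ClickHouse instance once through the client-specific API and once through `database/sql` (the address and credentials are placeholders, not part of the official examples); the comparison table that follows summarizes the feature differences:

```go
package main

import (
	"context"

	"github.com/ClickHouse/clickhouse-go/v2"
)

func main() {
	// Client-specific API: ClickHouse-specific options and column-oriented batching.
	conn, err := clickhouse.Open(&clickhouse.Options{
		Addr: []string{"127.0.0.1:9000"}, // placeholder address
		Auth: clickhouse.Auth{Database: "default", Username: "default"},
	})
	if err != nil {
		panic(err)
	}
	defer conn.Close()
	if err := conn.Ping(context.Background()); err != nil {
		panic(err)
	}

	// database/sql API: a generic *sql.DB that database-agnostic tooling can consume.
	db := clickhouse.OpenDB(&clickhouse.Options{
		Addr: []string{"127.0.0.1:9000"}, // placeholder address
		Auth: clickhouse.Auth{Database: "default", Username: "default"},
	})
	defer db.Close()
	if err := db.Ping(); err != nil {
		panic(err)
	}
}
```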
-| | Native format | Native protocol | HTTP protocol | Bulk write support | Struct marshaling | Compression | Query Placeholders | -|:------------------:|:-------------:|:---------------:|:-------------:|:------------------:|:-----------------:|:-----------:|:------------------:| -| ClickHouse API | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | -| `database/sql` API | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | + | | Native format | Native protocol | HTTP protocol | Bulk write support | Struct marshaling | Compression | Query Placeholders | + |:------------------:|:-------------:|:---------------:|:-------------:|:------------------:|:-----------------:|:-----------:|:------------------:| + | ClickHouse API | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | + | `database/sql` API | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ## Installation {#installation} @@ -251,7 +251,6 @@ The client supports: | => 2.0 <= 2.2 | 1.17, 1.18 | | >= 2.3 | 1.18 | - ## ClickHouse client API {#clickhouse-client-api} All code examples for the ClickHouse Client API can be found [here](https://github.com/ClickHouse/clickhouse-go/tree/main/examples). @@ -302,8 +301,8 @@ When opening a connection, an Options struct can be used to control client behav * `ConnOpenStrategy` - determines how the list of node addresses should be consumed and used to open connections. See [Connecting to Multiple Nodes](#connecting-to-multiple-nodes). * `BlockBufferSize` - maximum number of blocks to decode into the buffer at once. Larger values will increase parallelization at the expense of memory. Block sizes are query dependent so while you can set this on the connection, we recommend you override per query based on the data it returns. Defaults to `2`. -```go -conn, err := clickhouse.Open(&clickhouse.Options{ + ```go + conn, err := clickhouse.Open(&clickhouse.Options{ Addr: []string{fmt.Sprintf("%s:%d", env.Host, env.Port)}, Auth: clickhouse.Auth{ Database: env.Database, @@ -331,12 +330,12 @@ conn, err := clickhouse.Open(&clickhouse.Options{ ConnMaxLifetime: time.Duration(10) * time.Minute, ConnOpenStrategy: clickhouse.ConnOpenInOrder, BlockBufferSize: 10, -}) -if err != nil { + }) + if err != nil { return err -} -``` -[Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/clickhouse_api/connect_settings.go) + } + ``` + [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/clickhouse_api/connect_settings.go) #### Connection pooling {#connection-pooling} @@ -460,16 +459,15 @@ fmt.Println(v.String()) [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/1c0d81d0b1388dbb9e09209e535667df212f4ae4/examples/clickhouse_api/multi_host.go#L26-L45) - Two connection strategies are available: * `ConnOpenInOrder` (default) - addresses are consumed in order. Later addresses are only utilized in case of failure to connect using addresses earlier in the list. This is effectively a failure-over strategy. * `ConnOpenRoundRobin` - Load is balanced across the addresses using a round-robin strategy. 
-This can be controlled through the option `ConnOpenStrategy` + This can be controlled through the option `ConnOpenStrategy` -```go -conn, err := clickhouse.Open(&clickhouse.Options{ + ```go + conn, err := clickhouse.Open(&clickhouse.Options{ Addr: []string{"127.0.0.1:9001", "127.0.0.1:9002", fmt.Sprintf("%s:%d", env.Host, env.Port)}, ConnOpenStrategy: clickhouse.ConnOpenRoundRobin, Auth: clickhouse.Auth{ @@ -477,17 +475,17 @@ conn, err := clickhouse.Open(&clickhouse.Options{ Username: env.Username, Password: env.Password, }, -}) -if err != nil { + }) + if err != nil { return err -} -v, err := conn.ServerVersion() -if err != nil { + } + v, err := conn.ServerVersion() + if err != nil { return err -} -``` + } + ``` -[Full Example](https://github.com/ClickHouse/clickhouse-go/blob/1c0d81d0b1388dbb9e09209e535667df212f4ae4/examples/clickhouse_api/multi_host.go#L50-L67) + [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/1c0d81d0b1388dbb9e09209e535667df212f4ae4/examples/clickhouse_api/multi_host.go#L50-L67) ### Execution {#execution} @@ -509,7 +507,6 @@ conn.Exec(context.Background(), "INSERT INTO example VALUES (1, 'test-1')") [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/clickhouse_api/exec.go) - Note the ability to pass a Context to the query. This can be used to pass specific query level settings - see [Using Context](#using-context). ### Batch Insert {#batch-insert} @@ -544,7 +541,6 @@ if err != nil { return err } - batch, err := conn.PrepareBatch(ctx, "INSERT INTO example") if err != nil { return err @@ -603,12 +599,10 @@ return batch.Send() [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/clickhouse_api/type_convert.go) - For a full summary of supported go types for each column type, see [Type Conversions](#type-conversions). ### Querying rows {#querying-rows} - Users can either query for a single row using the `QueryRow` method or obtain a cursor for iteration over a result set via `Query`. While the former accepts a destination for the data to be serialized into, the latter requires the to call `Scan` on each row. ```go @@ -761,7 +755,6 @@ for _, v := range result { } ``` - [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/clickhouse_api/select_struct.go) #### Scan struct {#scan-struct} @@ -1115,7 +1108,6 @@ if err != nil { return err } - batch, err := conn.PrepareBatch(ctx, "INSERT INTO example") if err != nil { return err @@ -1155,7 +1147,6 @@ if err := batch.Send(); err != nil { [Full Example - `flatten_nested=1`](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/clickhouse_api/nested.go#L123-L180) - Note: Nested columns must have the same dimensions. For example, in the above example, `Col_2_2` and `Col_2_1` must have the same number of elements. Due to a more straightforward interface and official support for nesting, we recommend `flatten_nested=0`. @@ -1512,7 +1503,6 @@ if err := batch.Send(); err != nil { [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/clickhouse_api/compression.go) - Additional compression techniques are available if using the standard interface over HTTP. See [database/sql API - Compression](#compression) for further details. 
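For reference, a minimal sketch of enabling native-protocol compression through the `Compression` field of `clickhouse.Options` might look like the following (the address, credentials, and chosen method/level are placeholders to adapt to your workload):

```go
package main

import (
	"context"

	"github.com/ClickHouse/clickhouse-go/v2"
)

func main() {
	conn, err := clickhouse.Open(&clickhouse.Options{
		Addr: []string{"127.0.0.1:9000"}, // placeholder address
		Auth: clickhouse.Auth{Database: "default", Username: "default"},
		// ZSTD with a moderate level; clickhouse.CompressionLZ4 is the lighter-weight alternative
		// for the native protocol.
		Compression: &clickhouse.Compression{
			Method: clickhouse.CompressionZSTD,
			Level:  3,
		},
	})
	if err != nil {
		panic(err)
	}
	defer conn.Close()

	// All subsequent batches and query results on this connection use the configured compression.
	if err := conn.Ping(context.Background()); err != nil {
		panic(err)
	}
}
```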
### Parameter binding {#parameter-binding} @@ -1682,7 +1672,6 @@ for i := 1; i <= 6; i++ { [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/clickhouse_api/context.go) - ### Progress/profile/log information {#progressprofilelog-information} Progress, Profile, and Log information can be requested on queries. Progress information will report statistics on the number of rows and bytes that have been read and processed in ClickHouse. Conversely, Profile information provides a summary of data returned to the client, including totals of bytes (uncompressed), rows, and blocks. Finally, log information provides statistics on threads, e.g., memory usage and data speed. @@ -1714,7 +1703,6 @@ rows.Close() [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/clickhouse_api/progress.go) - ### Dynamic scanning {#dynamic-scanning} Users may need to read tables for which they do not know the schema or type of the fields being returned. This is common in cases where ad-hoc data analysis is performed or generic tooling is written. To achieve this, column-type information is available on query responses. This can be used with Go reflection to create runtime instances of correctly typed variables which can be passed to Scan. @@ -1753,7 +1741,6 @@ for rows.Next() { [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/clickhouse_api/dynamic_scan_types.go) - ### External tables {#external-tables} [External tables](/engines/table-engines/special/external-data/) allow the client to send data to ClickHouse, with a SELECT query. This data is put in a temporary table and can be used in the query itself for evaluation. @@ -1842,7 +1829,6 @@ fmt.Printf("count: %d\n", count) Full details on exploiting tracing can be found under [OpenTelemetry support](/operations/opentelemetry/). - ## Database/SQL API {#databasesql-api} The `database/sql` or "standard" API allows users to use the client in scenarios where application code should be agnostic of the underlying databases by conforming to a standard interface. This comes at some expense - additional layers of abstraction and indirection and primitives which are not necessarily aligned with ClickHouse. These costs are, however, typically acceptable in scenarios where tooling needs to connect to multiple databases. @@ -1874,7 +1860,6 @@ func Connect() error { return conn.Ping() } - func ConnectDSN() error { env, err := GetStdTestEnvironment() if err != nil { @@ -1914,8 +1899,8 @@ The following parameters can be passed in the DSN string: * `skip_verify` - skip certificate verification (default is `false`) * `block_buffer_size` - allows users to control the block buffer size. See [`BlockBufferSize`](#connection-settings). 
(default is `2`) -```go -func ConnectSettings() error { + ```go + func ConnectSettings() error { env, err := GetStdTestEnvironment() if err != nil { return err @@ -1925,9 +1910,9 @@ func ConnectSettings() error { return err } return conn.Ping() -} -``` -[Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/std/connect_settings.go) + } + ``` + [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/std/connect_settings.go) #### Connection pooling {#connection-pooling-1} @@ -2043,7 +2028,6 @@ func ConnectSSL() error { } t.RootCAs = caCertPool - conn := clickhouse.OpenDB(&clickhouse.Options{ Addr: []string{fmt.Sprintf("%s:%d", env.Host, env.SslPort)}, Auth: clickhouse.Auth{ @@ -2131,7 +2115,6 @@ _, err = conn.Exec("INSERT INTO example VALUES (1, 'test-1')") [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/std/exec.go) - This method does not support receiving a context - by default, it executes with the background context. Users can use `ExecContext` if this is needed - see [Using Context](#using-context). ### Batch Insert {#batch-insert-1} @@ -2314,7 +2297,6 @@ The standard API supports the same compression algorithms as native [ClickHouse If using the `OpenDB` method to establish a connection, a Compression configuration can be passed. This includes the ability to specify the compression level (see below). If connecting via `sql.Open` with DSN, utilize the parameter `compress`. This can either be a specific compression algorithm i.e. `gzip`, `deflate`, `br`, `zstd` or `lz4` or a boolean flag. If set to true, `lz4` will be used. The default is `none` i.e. compression disabled. - ```go conn := clickhouse.OpenDB(&clickhouse.Options{ Addr: []string{fmt.Sprintf("%s:%d", env.Host, env.HttpPort)}, @@ -2332,7 +2314,6 @@ conn := clickhouse.OpenDB(&clickhouse.Options{ ``` [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/std/compression.go#L27-L76) - ```go conn, err := sql.Open("clickhouse", fmt.Sprintf("http://%s:%d?username=%s&password=%s&compress=gzip&compress_level=5", env.Host, env.HttpPort, env.Username, env.Password)) ``` @@ -2463,7 +2444,6 @@ for rows.Next() { [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/std/context.go) - ### Sessions {#sessions} While native connections inherently have a session, connections over HTTP require the user to create a session id for passing in a context as a setting. This allows the use of features, e.g., Temporary tables, which are bound to a session. @@ -2613,7 +2593,6 @@ for rows.Next() { } rows.Close() - var count uint64 if err := conn.QueryRowContext(ctx, "SELECT COUNT(*) FROM external_table_1").Scan(&count); err != nil { return err @@ -2631,7 +2610,6 @@ fmt.Printf("external_table_1 UNION external_table_2: %d\n", count) [Full Example](https://github.com/ClickHouse/clickhouse-go/blob/main/examples/std/external_data.go) - ### Open telemetry {#open-telemetry-1} ClickHouse allows a [trace context](/operations/opentelemetry/) to be passed as part of the native protocol. The client allows a Span to be created via the function `clickhouse.withSpan` and passed via the Context to achieve this. This is not supported when HTTP is used as transport. 
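Tying together the `database/sql` sections above, a minimal sketch of the session pattern over HTTP might look like the following (the endpoint, credentials, and `session_id` value are placeholders, and the exact settings should be adapted to your deployment):

```go
package main

import (
	"context"
	"fmt"

	"github.com/ClickHouse/clickhouse-go/v2"
)

func main() {
	// Placeholder endpoint and credentials; HTTP transport is what requires an explicit session id.
	db := clickhouse.OpenDB(&clickhouse.Options{
		Addr:     []string{"127.0.0.1:8123"},
		Protocol: clickhouse.HTTP,
		Auth:     clickhouse.Auth{Database: "default", Username: "default"},
	})
	defer db.Close()

	// A fixed session id keeps statements on the same server-side session,
	// so session-bound features such as temporary tables work over HTTP.
	ctx := clickhouse.Context(context.Background(), clickhouse.WithSettings(clickhouse.Settings{
		"session_id": "example-session-1", // placeholder value
	}))

	if _, err := db.ExecContext(ctx, "CREATE TEMPORARY TABLE example_tmp (x UInt8)"); err != nil {
		panic(err)
	}
	var count uint64
	if err := db.QueryRowContext(ctx, "SELECT count() FROM example_tmp").Scan(&count); err != nil {
		panic(err)
	}
	fmt.Println(count)
}
```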
diff --git a/docs/integrations/language-clients/java/client/_snippets/_v0_7.mdx b/docs/integrations/language-clients/java/client/_snippets/_v0_7.mdx index 052965fed45..cc8c1bc7331 100644 --- a/docs/integrations/language-clients/java/client/_snippets/_v0_7.mdx +++ b/docs/integrations/language-clients/java/client/_snippets/_v0_7.mdx @@ -11,64 +11,53 @@ This library will be deprecated soon. Use the latest [Java Client](/integrations - ```xml - com.clickhouse - clickhouse-http-client - 0.7.2 +com.clickhouse +clickhouse-http-client +0.7.2 ``` - - ```kotlin // https://mvnrepository.com/artifact/com.clickhouse/clickhouse-http-client implementation("com.clickhouse:clickhouse-http-client:0.7.2") ``` - ```groovy // https://mvnrepository.com/artifact/com.clickhouse/clickhouse-http-client implementation 'com.clickhouse:clickhouse-http-client:0.7.2' ``` - Since version `0.5.0`, the driver uses a new client http library that needs to be added as a dependency. - - ```xml - org.apache.httpcomponents.client5 - httpclient5 - 5.3.1 +org.apache.httpcomponents.client5 +httpclient5 +5.3.1 ``` - - ```kotlin // https://mvnrepository.com/artifact/org.apache.httpcomponents.client5/httpclient5 implementation("org.apache.httpcomponents.client5:httpclient5:5.3.1") ``` - ```groovy // https://mvnrepository.com/artifact/org.apache.httpcomponents.client5/httpclient5 implementation 'org.apache.httpcomponents.client5:httpclient5:5.3.1' ``` - @@ -79,18 +68,18 @@ Connection URL Format: `protocol://host[:port][/database][?param[=value][¶m[ - `http://localhost:8443?ssl=true&sslmode=NONE` - `https://(https://explorer@play.clickhouse.com:443` -Connect to a single node: + Connect to a single node: -```java showLineNumbers -ClickHouseNode server = ClickHouseNode.of("http://localhost:8123/default?compress=0"); -``` -Connect to a cluster with multiple nodes: + ```java showLineNumbers + ClickHouseNode server = ClickHouseNode.of("http://localhost:8123/default?compress=0"); + ``` + Connect to a cluster with multiple nodes: -```java showLineNumbers -ClickHouseNodes servers = ClickHouseNodes.of( + ```java showLineNumbers + ClickHouseNodes servers = ClickHouseNodes.of( "jdbc:ch:http://server1.domain,server2.domain,server3.domain/my_db" + "?load_balancing_policy=random&health_check_interval=5000&failover=2"); -``` + ``` ## Query API {#query-api} @@ -124,15 +113,12 @@ try (ClickHouseClient client = ClickHouseClient.newInstance(ClickHouseProtocol.H } ``` - See [complete code example](https://github.com/ClickHouse/clickhouse-java/blob/main/examples/client/src/main/java/com/clickhouse/examples/jdbc/Main.java#L73) in the [repo](https://github.com/ClickHouse/clickhouse-java/tree/main/examples/client). ## Insert API {#insert-api} ```java showLineNumbers - - try (ClickHouseClient client = ClickHouseClient.newInstance(ClickHouseProtocol.HTTP); ClickHouseResponse response = client.read(servers).write() .format(ClickHouseFormat.RowBinaryWithNamesAndTypes) @@ -152,7 +138,6 @@ RowBinary format is described on its [page](/interfaces/formats#rowbinarywithnam There is an example of [code](https://github.com/ClickHouse/clickhouse-kafka-connect/blob/main/src/main/java/com/clickhouse/kafka/connect/sink/db/ClickHouseWriter.java#L622). 
- ## Features {#features} ### Compression {#compression} @@ -160,35 +145,29 @@ The client will by default use LZ4 compression, which requires this dependency: - ```xml - org.lz4 - lz4-java - 1.8.0 +org.lz4 +lz4-java +1.8.0 ``` - - ```kotlin // https://mvnrepository.com/artifact/org.lz4/lz4-java implementation("org.lz4:lz4-java:1.8.0") ``` - ```groovy // https://mvnrepository.com/artifact/org.lz4/lz4-java implementation 'org.lz4:lz4-java:1.8.0' ``` - - You can choose to use gzip instead by setting `compress_algorithm=gzip` in the connection URL. Alternatively, you can disable compression a few ways. @@ -196,14 +175,14 @@ Alternatively, you can disable compression a few ways. 1. Disable by setting `compress=0` in the connection URL: `http://localhost:8123/default?compress=0` 2. Disable via the client configuration: -```java showLineNumbers -ClickHouseClient client = ClickHouseClient.builder() - .config(new ClickHouseConfig(Map.of(ClickHouseClientOption.COMPRESS, false))) - .nodeSelector(ClickHouseNodeSelector.of(ClickHouseProtocol.HTTP)) - .build(); -``` + ```java showLineNumbers + ClickHouseClient client = ClickHouseClient.builder() + .config(new ClickHouseConfig(Map.of(ClickHouseClientOption.COMPRESS, false))) + .nodeSelector(ClickHouseNodeSelector.of(ClickHouseProtocol.HTTP)) + .build(); + ``` -See the [compression documentation](/data-compression/compression-modes) to learn more about different compression options. + See the [compression documentation](/data-compression/compression-modes) to learn more about different compression options. ### Multiple queries {#multiple-queries} @@ -303,16 +282,16 @@ The Java client chooses a ClickHouse node to send requests to, according to the 2. Managing node's status. 3. Optionally schedule a background process for node discovery (if auto-discovery is enabled) and run a health check. -Here is a list of options to configure load balancing: + Here is a list of options to configure load balancing: -| Property | Default | Description | -|-----------------------|-------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| load_balancing_policy | `""` | The load-balancing policy can be one of:

  • `firstAlive` - request is sent to the first healthy node from the managed node list
  • `random` - request is sent to a random node from the managed node list
  • `roundRobin` - request is sent to each node from the managed node list, in turn.
  • full qualified class name implementing `ClickHouseLoadBalancingPolicy` - custom load balancing policy
  • If it is not specified the request is sent to the first node from the managed node list | -| load_balancing_tags | `""` | Load balancing tags for filtering out nodes. Requests are sent only to nodes that have the specified tags | -| health_check_interval | `0` | Health check interval in milliseconds, zero or negative value means one-time. | -| health_check_method | `ClickHouseHealthCheckMethod.SELECT_ONE` | Health check method. Can be one of:
  • `ClickHouseHealthCheckMethod.SELECT_ONE` - check with `select 1` query
  • `ClickHouseHealthCheckMethod.PING` - protocol-specific check, which is generally faster
  • | -| node_check_interval | `0` | Node check interval in milliseconds, negative number is treated as zero. The node status is checked if the specified amount of time has passed since the last check.
    The difference between `health_check_interval` and `node_check_interval` is that the `health_check_interval` option schedules the background job, which checks the status for the list of nodes (all or faulty), but `node_check_interval` specifies the amount of time has passed since the last check for the particular node | -| check_all_nodes | `false` | Whether to perform a health check against all nodes or just faulty ones. | + | Property | Default | Description | + |-----------------------|-------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | load_balancing_policy | `""` | The load-balancing policy can be one of:
  • `firstAlive` - request is sent to the first healthy node from the managed node list
  • `random` - request is sent to a random node from the managed node list
  • `roundRobin` - request is sent to each node from the managed node list, in turn.
  • fully qualified class name of a class implementing `ClickHouseLoadBalancingPolicy` - custom load-balancing policy<br/>
  • If it is not specified, the request is sent to the first node from the managed node list | + | load_balancing_tags | `""` | Load-balancing tags for filtering out nodes. Requests are sent only to nodes that have the specified tags | + | health_check_interval | `0` | Health check interval in milliseconds; a zero or negative value means the health check runs only once. | + | health_check_method | `ClickHouseHealthCheckMethod.SELECT_ONE` | Health check method. Can be one of:<br/>
  • `ClickHouseHealthCheckMethod.SELECT_ONE` - check with `select 1` query
  • `ClickHouseHealthCheckMethod.PING` - protocol-specific check, which is generally faster
  | + | node_check_interval | `0` | Node check interval in milliseconds; a negative number is treated as zero. The node status is checked once the specified amount of time has passed since the last check.<br/>
    The difference between `health_check_interval` and `node_check_interval` is that the `health_check_interval` option schedules the background job, which checks the status for the list of nodes (all or faulty), but `node_check_interval` specifies the amount of time that must pass since the last check of a particular node before it is checked again |
 + | check_all_nodes | `false` | Whether to perform a health check against all nodes or just faulty ones. |

### Failover and retry {#failover-and-retry}

diff --git a/docs/integrations/language-clients/java/client/_snippets/_v0_8.mdx b/docs/integrations/language-clients/java/client/_snippets/_v0_8.mdx
index 2bbf3ce862f..077a9af898b 100644
--- a/docs/integrations/language-clients/java/client/_snippets/_v0_8.mdx
+++ b/docs/integrations/language-clients/java/client/_snippets/_v0_8.mdx
@@ -9,33 +9,27 @@ The library provides its own API to send requests to a server. The library also
- Maven Central (project web page): https://mvnrepository.com/artifact/com.clickhouse/client-v2
- Nightly builds (repository link): https://s01.oss.sonatype.org/content/repositories/snapshots/com/clickhouse/
-<br/>
    - -```xml +```xml - com.clickhouse - client-v2 - 0.9.0 +com.clickhouse +client-v2 +0.9.0 ``` - - ```kotlin // https://mvnrepository.com/artifact/com.clickhouse/client-v2 implementation("com.clickhouse:client-v2:0.9.0") ``` - ```groovy // https://mvnrepository.com/artifact/com.clickhouse/client-v2 implementation 'com.clickhouse:client-v2:0.9.0' ``` - @@ -94,8 +88,7 @@ Please use tools like [openssl](https://docs.openssl.org/master/man1/openssl/) t - get CN from an user certificate - `openssl x509 -noout -subject -in [user.cert]` - verify same value is set in database `select name, auth_type, auth_params from system.users where auth_type = 'ssl_certificate'` (query will output `auth_params` with something like ` {"common_names":["some_user"]}`) -::: - + ::: ## Configuration {#configuration} @@ -110,7 +103,7 @@ Configuration is defined during client creation. See `com.clickhouse.client.api. |-----------------------|-----------------------------|:--------------------------------------------| | `addEndpoint(String endpoint)` | - `enpoint` - URL formatted a server address. | Adds a server endpoint to list of available servers. Currently only one endpoint is supported.

    Default: `none`
    Enum: `none`
    Key: `none` | | `addEndpoint(Protocol protocol, String host, int port, boolean secure)` | - `protocol` - connection protocol `com.clickhouse.client.api.enums.Protocol#HTTP`.
    - `host` - IP or hostname of a server.
    - `secure` - whether communication should use the secure version of the protocol (HTTPS) | Adds a server endpoint to the list of available servers. Currently only one endpoint is supported.<br/>

    Default: `none`
    Enum: `none`
    Key: `none` | -| `setOption(String key, String value)` | - `key` - String key of the client configuration option.
    - `value` - String value of the option | Sets raw value of client options. Useful when reading configuration from properties files. | +| `setOption(String key, String value)` | - `key` - String key of the client configuration option.
    - `value` - String value of the option | Sets the raw value of a client option. Useful when reading configuration from properties files. | | `setUsername(String username)` | - `username` - username to use for authentication | Sets the username for the authentication method selected by further configuration<br/>

    Default: `default`
    Enum: `ClientConfigProperties.USER`
    Key: `user` | | `setPassword(String password)` | - `password` - secret value for password authentication | Sets a secret for password authentication and effectively selects it as the authentication method<br/>

    Default: -
    Enum: `ClientConfigProperties.PASSWORD`
    Key: `password` | | `setAccessToken(String accessToken)` | - `accessToken` - String representation of an access token | Sets an access token to authenticate with and selects the corresponding authentication method<br/>

    Default: -
    Enum: `ClientConfigProperties.ACCESS_TOKEN`
    Key: `access_token` | @@ -130,7 +123,7 @@ Configuration is defined during client creation. See `com.clickhouse.client.api. | `setSocketLinger(int secondsToWait)` | - `secondsToWait` - number of seconds. | Set linger time for every TCP socket created by the client.

    Default: -
    Enum: `ClientConfigProperties.SOCKET_LINGER_OPT`
    Key: `socket_linger` | | `compressServerResponse(boolean enabled)` | - `enabled` - flag that indicates if the option should be enabled | Sets whether the server should compress its responses.<br/>

    Default: `true`
    Enum: `ClientConfigProperties.COMPRESS_SERVER_RESPONSE`
    Key: `compress` | | `compressClientRequest(boolean enabled)` | - `enabled` - flag that indicates if the option should be enabled | Sets whether the client should compress its requests.<br/>

    Default: `false`
    Enum: `ClientConfigProperties.COMPRESS_CLIENT_REQUEST`
    Key: `decompress` | -| `useHttpCompression(boolean enabled)` | - `enabled` - flag that indicates if the option should be enabled | Sets if HTTP compression should be used for client/server communications if corresponding options are enabled | +| `useHttpCompression(boolean enabled)` | - `enabled` - flag that indicates if the option should be enabled | Sets whether HTTP compression should be used for client/server communication if the corresponding options are enabled | | `appCompressedData(boolean enabled)` | - `enabled` - flag that indicates if the option should be enabled | Tells the client that compression will be handled by the application.<br/>

    Default: `false`
    Enum: `ClientConfigProperties.APP_COMPRESSED_DATA`
    Key: `app_compressed_data` | | `setLZ4UncompressedBufferSize(int size)` | - `size` - size in bytes | Sets the size of the buffer that receives the uncompressed portion of a data stream. If the buffer is undersized, a new one will be created and a corresponding warning will be written to the logs.<br/>

    Default: `65536`
    Enum: `ClientConfigProperties.COMPRESSION_LZ4_UNCOMPRESSED_BUF_SIZE`
    Key: `compression.lz4.uncompressed_buffer_size` | | `disableNativeCompression` | - `disable` - flag that indicates if the option should be disabled | Disables native compression when set to `true`.<br/>

    Default: `false`
    Enum: `ClientConfigProperties.DISABLE_NATIVE_COMPRESSION`
    Key: `disable_native_compression` | @@ -182,7 +175,6 @@ Server side settings can be set on the client level once while creation (see `se .build()) { - // Operation level QuerySettings querySettings = new QuerySettings(); querySettings.serverSetting("session_timezone", "Europe/Zurich"); @@ -205,7 +197,6 @@ QuerySettings settings = new QuerySettings() When options are set via `setOption` method (either the `Client.Builder` or operation settings class) then custom header name should be prefixed with `http_header_`. Method `com.clickhouse.client.api.ClientConfigProperties#httpHeader()` may be handy in this case. - ## Common Definitions {#common-definitions} ### ClickHouseFormat {#clickhouseformat} @@ -216,92 +207,91 @@ Enum of [supported formats](/interfaces/formats). It includes all formats that C * `full` - the client can transcode data by itself and accepts a raw data stream * `-` - operation not supported by ClickHouse for this format -This client version supports: - -| Format | Input | Output | -|-------------------------------------------------------------------------------------------------------------------------------|:------:|:-------:| -| [TabSeparated](/interfaces/formats#tabseparated) | raw | raw | -| [TabSeparatedRaw](/interfaces/formats#tabseparatedraw) | raw | raw | -| [TabSeparatedWithNames](/interfaces/formats#tabseparatedwithnames) | raw | raw | -| [TabSeparatedWithNamesAndTypes](/interfaces/formats#tabseparatedwithnamesandtypes) | raw | raw | -| [TabSeparatedRawWithNames](/interfaces/formats#tabseparatedrawwithnames) | raw | raw | -| [TabSeparatedRawWithNamesAndTypes](/interfaces/formats#tabseparatedrawwithnamesandtypes) | raw | raw | -| [Template](/interfaces/formats#format-template) | raw | raw | -| [TemplateIgnoreSpaces](/interfaces/formats#templateignorespaces) | raw | - | -| [CSV](/interfaces/formats#csv) | raw | raw | -| [CSVWithNames](/interfaces/formats#csvwithnames) | raw | raw | -| [CSVWithNamesAndTypes](/interfaces/formats#csvwithnamesandtypes) | raw | raw | -| [CustomSeparated](/interfaces/formats#format-customseparated) | raw | raw | -| [CustomSeparatedWithNames](/interfaces/formats#customseparatedwithnames) | raw | raw | -| [CustomSeparatedWithNamesAndTypes](/interfaces/formats#customseparatedwithnamesandtypes) | raw | raw | -| [SQLInsert](/interfaces/formats#sqlinsert) | - | raw | -| [Values](/interfaces/formats#data-format-values) | raw | raw | -| [Vertical](/interfaces/formats#vertical) | - | raw | -| [JSON](/interfaces/formats#json) | raw | raw | -| [JSONAsString](/interfaces/formats#jsonasstring) | raw | - | -| [JSONAsObject](/interfaces/formats#jsonasobject) | raw | - | -| [JSONStrings](/interfaces/formats#jsonstrings) | raw | raw | -| [JSONColumns](/interfaces/formats#jsoncolumns) | raw | raw | -| [JSONColumnsWithMetadata](/interfaces/formats#jsoncolumnsmonoblock) | raw | raw | -| [JSONCompact](/interfaces/formats#jsoncompact) | raw | raw | -| [JSONCompactStrings](/interfaces/formats#jsoncompactstrings) | - | raw | -| [JSONCompactColumns](/interfaces/formats#jsoncompactcolumns) | raw | raw | -| [JSONEachRow](/interfaces/formats#jsoneachrow) | raw | raw | -| [PrettyJSONEachRow](/interfaces/formats#prettyjsoneachrow) | - | raw | -| [JSONEachRowWithProgress](/interfaces/formats#jsoneachrowwithprogress) | - | raw | -| [JSONStringsEachRow](/interfaces/formats#jsonstringseachrow) | raw | raw | -| [JSONStringsEachRowWithProgress](/interfaces/formats#jsonstringseachrowwithprogress) | - | raw | -| [JSONCompactEachRow](/interfaces/formats#jsoncompacteachrow) 
| raw | raw | -| [JSONCompactEachRowWithNames](/interfaces/formats#jsoncompacteachrowwithnames) | raw | raw | -| [JSONCompactEachRowWithNamesAndTypes](/interfaces/formats#jsoncompacteachrowwithnamesandtypes) | raw | raw | -| [JSONCompactStringsEachRow](/interfaces/formats#jsoncompactstringseachrow) | raw | raw | -| [JSONCompactStringsEachRowWithNames](/interfaces/formats#jsoncompactstringseachrowwithnames) | raw | raw | -| [JSONCompactStringsEachRowWithNamesAndTypes](/interfaces/formats#jsoncompactstringseachrowwithnamesandtypes) | raw | raw | -| [JSONObjectEachRow](/interfaces/formats#jsonobjecteachrow) | raw | raw | -| [BSONEachRow](/interfaces/formats#bsoneachrow) | raw | raw | -| [TSKV](/interfaces/formats#tskv) | raw | raw | -| [Pretty](/interfaces/formats#pretty) | - | raw | -| [PrettyNoEscapes](/interfaces/formats#prettynoescapes) | - | raw | -| [PrettyMonoBlock](/interfaces/formats#prettymonoblock) | - | raw | -| [PrettyNoEscapesMonoBlock](/interfaces/formats#prettynoescapesmonoblock) | - | raw | -| [PrettyCompact](/interfaces/formats#prettycompact) | - | raw | -| [PrettyCompactNoEscapes](/interfaces/formats#prettycompactnoescapes) | - | raw | -| [PrettyCompactMonoBlock](/interfaces/formats#prettycompactmonoblock) | - | raw | -| [PrettyCompactNoEscapesMonoBlock](/interfaces/formats#prettycompactnoescapesmonoblock) | - | raw | -| [PrettySpace](/interfaces/formats#prettyspace) | - | raw | -| [PrettySpaceNoEscapes](/interfaces/formats#prettyspacenoescapes) | - | raw | -| [PrettySpaceMonoBlock](/interfaces/formats#prettyspacemonoblock) | - | raw | -| [PrettySpaceNoEscapesMonoBlock](/interfaces/formats#prettyspacenoescapesmonoblock) | - | raw | -| [Prometheus](/interfaces/formats#prometheus) | - | raw | -| [Protobuf](/interfaces/formats#protobuf) | raw | raw | -| [ProtobufSingle](/interfaces/formats#protobufsingle) | raw | raw | -| [ProtobufList](/interfaces/formats#protobuflist) | raw | raw | -| [Avro](/interfaces/formats#data-format-avro) | raw | raw | -| [AvroConfluent](/interfaces/formats#data-format-avro-confluent) | raw | - | -| [Parquet](/interfaces/formats#data-format-parquet) | raw | raw | -| [ParquetMetadata](/interfaces/formats#data-format-parquet-metadata) | raw | - | -| [Arrow](/interfaces/formats#data-format-arrow) | raw | raw | -| [ArrowStream](/interfaces/formats#data-format-arrow-stream) | raw | raw | -| [ORC](/interfaces/formats#data-format-orc) | raw | raw | -| [One](/interfaces/formats#data-format-one) | raw | - | -| [Npy](/interfaces/formats#data-format-npy) | raw | raw | -| [RowBinary](/interfaces/formats#rowbinary) | full | full | -| [RowBinaryWithNames](/interfaces/formats#rowbinarywithnamesandtypes) | full | full | -| [RowBinaryWithNamesAndTypes](/interfaces/formats#rowbinarywithnamesandtypes) | full | full | -| [RowBinaryWithDefaults](/interfaces/formats#rowbinarywithdefaults) | full | - | -| [Native](/interfaces/formats#native) | full | raw | -| [Null](/interfaces/formats#null) | - | raw | -| [XML](/interfaces/formats#xml) | - | raw | -| [CapnProto](/interfaces/formats#capnproto) | raw | raw | -| [LineAsString](/interfaces/formats#lineasstring) | raw | raw | -| [Regexp](/interfaces/formats#data-format-regexp) | raw | - | -| [RawBLOB](/interfaces/formats#rawblob) | raw | raw | -| [MsgPack](/interfaces/formats#msgpack) | raw | raw | -| [MySQLDump](/interfaces/formats#mysqldump) | raw | - | -| [DWARF](/interfaces/formats#dwarf) | raw | - | -| [Markdown](/interfaces/formats#markdown) | - | raw | -| [Form](/interfaces/formats#form) | raw | - | - + This client 
version supports: + + | Format | Input | Output | + |-------------------------------------------------------------------------------------------------------------------------------|:------:|:-------:| + | [TabSeparated](/interfaces/formats#tabseparated) | raw | raw | + | [TabSeparatedRaw](/interfaces/formats#tabseparatedraw) | raw | raw | + | [TabSeparatedWithNames](/interfaces/formats#tabseparatedwithnames) | raw | raw | + | [TabSeparatedWithNamesAndTypes](/interfaces/formats#tabseparatedwithnamesandtypes) | raw | raw | + | [TabSeparatedRawWithNames](/interfaces/formats#tabseparatedrawwithnames) | raw | raw | + | [TabSeparatedRawWithNamesAndTypes](/interfaces/formats#tabseparatedrawwithnamesandtypes) | raw | raw | + | [Template](/interfaces/formats#format-template) | raw | raw | + | [TemplateIgnoreSpaces](/interfaces/formats#templateignorespaces) | raw | - | + | [CSV](/interfaces/formats#csv) | raw | raw | + | [CSVWithNames](/interfaces/formats#csvwithnames) | raw | raw | + | [CSVWithNamesAndTypes](/interfaces/formats#csvwithnamesandtypes) | raw | raw | + | [CustomSeparated](/interfaces/formats#format-customseparated) | raw | raw | + | [CustomSeparatedWithNames](/interfaces/formats#customseparatedwithnames) | raw | raw | + | [CustomSeparatedWithNamesAndTypes](/interfaces/formats#customseparatedwithnamesandtypes) | raw | raw | + | [SQLInsert](/interfaces/formats#sqlinsert) | - | raw | + | [Values](/interfaces/formats#data-format-values) | raw | raw | + | [Vertical](/interfaces/formats#vertical) | - | raw | + | [JSON](/interfaces/formats#json) | raw | raw | + | [JSONAsString](/interfaces/formats#jsonasstring) | raw | - | + | [JSONAsObject](/interfaces/formats#jsonasobject) | raw | - | + | [JSONStrings](/interfaces/formats#jsonstrings) | raw | raw | + | [JSONColumns](/interfaces/formats#jsoncolumns) | raw | raw | + | [JSONColumnsWithMetadata](/interfaces/formats#jsoncolumnsmonoblock) | raw | raw | + | [JSONCompact](/interfaces/formats#jsoncompact) | raw | raw | + | [JSONCompactStrings](/interfaces/formats#jsoncompactstrings) | - | raw | + | [JSONCompactColumns](/interfaces/formats#jsoncompactcolumns) | raw | raw | + | [JSONEachRow](/interfaces/formats#jsoneachrow) | raw | raw | + | [PrettyJSONEachRow](/interfaces/formats#prettyjsoneachrow) | - | raw | + | [JSONEachRowWithProgress](/interfaces/formats#jsoneachrowwithprogress) | - | raw | + | [JSONStringsEachRow](/interfaces/formats#jsonstringseachrow) | raw | raw | + | [JSONStringsEachRowWithProgress](/interfaces/formats#jsonstringseachrowwithprogress) | - | raw | + | [JSONCompactEachRow](/interfaces/formats#jsoncompacteachrow) | raw | raw | + | [JSONCompactEachRowWithNames](/interfaces/formats#jsoncompacteachrowwithnames) | raw | raw | + | [JSONCompactEachRowWithNamesAndTypes](/interfaces/formats#jsoncompacteachrowwithnamesandtypes) | raw | raw | + | [JSONCompactStringsEachRow](/interfaces/formats#jsoncompactstringseachrow) | raw | raw | + | [JSONCompactStringsEachRowWithNames](/interfaces/formats#jsoncompactstringseachrowwithnames) | raw | raw | + | [JSONCompactStringsEachRowWithNamesAndTypes](/interfaces/formats#jsoncompactstringseachrowwithnamesandtypes) | raw | raw | + | [JSONObjectEachRow](/interfaces/formats#jsonobjecteachrow) | raw | raw | + | [BSONEachRow](/interfaces/formats#bsoneachrow) | raw | raw | + | [TSKV](/interfaces/formats#tskv) | raw | raw | + | [Pretty](/interfaces/formats#pretty) | - | raw | + | [PrettyNoEscapes](/interfaces/formats#prettynoescapes) | - | raw | + | [PrettyMonoBlock](/interfaces/formats#prettymonoblock) | 
- | raw | + | [PrettyNoEscapesMonoBlock](/interfaces/formats#prettynoescapesmonoblock) | - | raw | + | [PrettyCompact](/interfaces/formats#prettycompact) | - | raw | + | [PrettyCompactNoEscapes](/interfaces/formats#prettycompactnoescapes) | - | raw | + | [PrettyCompactMonoBlock](/interfaces/formats#prettycompactmonoblock) | - | raw | + | [PrettyCompactNoEscapesMonoBlock](/interfaces/formats#prettycompactnoescapesmonoblock) | - | raw | + | [PrettySpace](/interfaces/formats#prettyspace) | - | raw | + | [PrettySpaceNoEscapes](/interfaces/formats#prettyspacenoescapes) | - | raw | + | [PrettySpaceMonoBlock](/interfaces/formats#prettyspacemonoblock) | - | raw | + | [PrettySpaceNoEscapesMonoBlock](/interfaces/formats#prettyspacenoescapesmonoblock) | - | raw | + | [Prometheus](/interfaces/formats#prometheus) | - | raw | + | [Protobuf](/interfaces/formats#protobuf) | raw | raw | + | [ProtobufSingle](/interfaces/formats#protobufsingle) | raw | raw | + | [ProtobufList](/interfaces/formats#protobuflist) | raw | raw | + | [Avro](/interfaces/formats#data-format-avro) | raw | raw | + | [AvroConfluent](/interfaces/formats#data-format-avro-confluent) | raw | - | + | [Parquet](/interfaces/formats#data-format-parquet) | raw | raw | + | [ParquetMetadata](/interfaces/formats#data-format-parquet-metadata) | raw | - | + | [Arrow](/interfaces/formats#data-format-arrow) | raw | raw | + | [ArrowStream](/interfaces/formats#data-format-arrow-stream) | raw | raw | + | [ORC](/interfaces/formats#data-format-orc) | raw | raw | + | [One](/interfaces/formats#data-format-one) | raw | - | + | [Npy](/interfaces/formats#data-format-npy) | raw | raw | + | [RowBinary](/interfaces/formats#rowbinary) | full | full | + | [RowBinaryWithNames](/interfaces/formats#rowbinarywithnamesandtypes) | full | full | + | [RowBinaryWithNamesAndTypes](/interfaces/formats#rowbinarywithnamesandtypes) | full | full | + | [RowBinaryWithDefaults](/interfaces/formats#rowbinarywithdefaults) | full | - | + | [Native](/interfaces/formats#native) | full | raw | + | [Null](/interfaces/formats#null) | - | raw | + | [XML](/interfaces/formats#xml) | - | raw | + | [CapnProto](/interfaces/formats#capnproto) | raw | raw | + | [LineAsString](/interfaces/formats#lineasstring) | raw | raw | + | [Regexp](/interfaces/formats#data-format-regexp) | raw | - | + | [RawBLOB](/interfaces/formats#rawblob) | raw | raw | + | [MsgPack](/interfaces/formats#msgpack) | raw | raw | + | [MySQLDump](/interfaces/formats#mysqldump) | raw | - | + | [DWARF](/interfaces/formats#dwarf) | raw | - | + | [Markdown](/interfaces/formats#markdown) | - | raw | + | [Form](/interfaces/formats#form) | raw | - | ## Insert API {#insert-api} @@ -371,14 +361,13 @@ Future of `InsertResponse` type - the result of the operation and additional inf **Examples** ```java showLineNumbers -// Important step (done once) - register class to pre-compile object serializer according to the table schema. +// Important step (done once) - register class to pre-compile object serializer according to the table schema. client.register(ArticleViewEvent.class, client.getTableSchema(TABLE_NAME)); - List events = loadBatch(); try (InsertResponse response = client.insert(TABLE_NAME, events).get()) { - // handle response, then it will be closed and connection that served request will be released. + // handle response, then it will be closed and connection that served request will be released. } ``` @@ -419,7 +408,7 @@ Sends `sqlQuery` as is. Response format is set by query settings. 
`QueryResponse **Signatures** -```java +```java CompletableFuture query(String sqlQuery, QuerySettings settings) CompletableFuture query(String sqlQuery) ``` @@ -436,7 +425,7 @@ Future of `QueryResponse` type - a result dataset and additional information lik **Examples** -```java +```java final String sql = "select * from " + TABLE_NAME + " where title <> '' limit 10"; // Default format is RowBinaryWithNamesAndTypesFormatReader so reader have all information about columns @@ -453,13 +442,13 @@ try (QueryResponse response = client.query(sql).get(3, TimeUnit.SECONDS);) { String title = reader.getString("title"); String url = reader.getString("url"); - // collecting data + // collecting data } } catch (Exception e) { log.error("Failed to read data", e); } -// put business logic outside of the reading block to release http connection asap. +// put business logic outside of the reading block to release http connection asap. ``` ### query(String sqlQuery, Map<String, Object> queryParams, QuerySettings settings) {#querystring-sqlquery-mapltstring-object-queryparams-querysettings-settings} @@ -467,7 +456,7 @@ try (QueryResponse response = client.query(sql).get(3, TimeUnit.SECONDS);) { Sends `sqlQuery` as is. Additionally will send query parameters so the server can compile the SQL expression. **Signatures** -```java +```java CompletableFuture query(String sqlQuery, Map queryParams, QuerySettings settings) ``` @@ -487,7 +476,7 @@ Future of `QueryResponse` type - a result dataset and additional information lik ```java showLineNumbers -// define parameters. They will be sent to the server along with the request. +// define parameters. They will be sent to the server along with the request. Map queryParams = new HashMap<>(); queryParams.put("param1", 2); @@ -500,7 +489,7 @@ try (QueryResponse queryResponse = while (reader.hasNext()) { reader.next(); // Read the next record from stream and parse it - // reading data + // reading data } } catch (Exception e) { @@ -514,7 +503,7 @@ try (QueryResponse queryResponse = Queries a data in `RowBinaryWithNamesAndTypes` format. Returns the result as a collection. Read performance is the same as with the reader but more memory is required to hold the whole dataset. **Signatures** -```java +```java List queryAll(String sqlQuery) ``` @@ -594,7 +583,7 @@ Fetches table schema for the `table`. **Signatures** -```java +```java TableSchema getTableSchema(String table) TableSchema getTableSchema(String table, String database) ``` @@ -615,7 +604,7 @@ Fetches schema from a SQL statement. **Signatures** -```java +```java TableSchema getTableSchemaFromQuery(String sql) ``` @@ -636,7 +625,7 @@ Column match is found by extracting its name from a method name. For example, `g **Signatures** -```java +```java void register(Class clazz, TableSchema schema) ``` @@ -646,10 +635,9 @@ void register(Class clazz, TableSchema schema) `schema` - Data schema to use for matching with POJO properties. - **Examples** -```java showLineNumbers +```java showLineNumbers client.register(ArticleViewEvent.class, client.getTableSchema(TABLE_NAME)); ``` diff --git a/docs/integrations/language-clients/java/index.md b/docs/integrations/language-clients/java/index.md index e684724db4f..4b09ae0f9ce 100644 --- a/docs/integrations/language-clients/java/index.md +++ b/docs/integrations/language-clients/java/index.md @@ -19,7 +19,7 @@ import CodeBlock from '@theme/CodeBlock'; Java client is a library implementing own API that abstracts details of network communications with ClickHouse server. 
Currently HTTP Interface is supported only. The library provide utilities to work with different ClickHouse formats and other related functions. -Java Client was developed far back in 2015. Its codebase became very hard to maintain, API is confusing, it is hard to optimize it further. So we have refactored it in 2024 into a new component `client-v2`. It has clear API, lighter codebase and more performance improvements, better ClickHouse formats support (RowBinary & Native mainly). JDBC will use this client in near feature. +Java Client was developed far back in 2015. Its codebase became very hard to maintain, API is confusing, it is hard to optimize it further. So we have refactored it in 2024 into a new component `client-v2`. It has clear API, lighter codebase and more performance improvements, better ClickHouse formats support (RowBinary & Native mainly). JDBC will use this client in near feature. ### Supported data types {#supported-data-types} @@ -76,15 +76,14 @@ Java Client was developed far back in 2015. Its codebase became very hard to mai |Dynamic |✔ |✗ | |JSON |✔ |✗ | - [ClickHouse Data Types](/sql-reference/data-types) :::note - AggregatedFunction - :warning: does not support `SELECT * FROM table ...` - Decimal - `SET output_format_decimal_trailing_zeros=1` in 21.9+ for consistency - Enum - can be treated as both string and integer -- UInt64 - mapped to `long` in client-v1 -::: +- UInt64 - mapped to `long` in client-v1 + ::: ### Features {#features} @@ -94,7 +93,7 @@ Table of features of the clients: |----------------------------------------------|:---------:|:---------:|:---------:| | Http Connection |✔ |✔ | | | Http Compression (LZ4) |✔ |✔ | | -| Server Response Compression - LZ4 |✔ |✔ | | +| Server Response Compression - LZ4 |✔ |✔ | | | Client Request Compression - LZ4 |✔ |✔ | | | HTTPS |✔ |✔ | | | Client SSL Cert (mTLS) |✔ |✔ | | @@ -111,7 +110,6 @@ Table of features of the clients: | SSL Client Authentication |✔ |✔ | | | Session timezone |✔ |✔ | | - JDBC Drive inherits same features as underlying client implementation. Other JDBC features are listed on its [page](/integrations/language-clients/java/jdbc). ### Compatibility {#compatibility} @@ -123,7 +121,7 @@ JDBC Drive inherits same features as underlying client implementation. Other JDB ### Logging {#logging} -Our Java language client uses [SLF4J](https://www.slf4j.org/) for logging. You can use any SLF4J-compatible logging framework, such as `Logback` or `Log4j`. +Our Java language client uses [SLF4J](https://www.slf4j.org/) for logging. You can use any SLF4J-compatible logging framework, such as `Logback` or `Log4j`. 
For example, if you are using Maven you could add the following dependency to your `pom.xml` file: ```xml title="pom.xml" diff --git a/docs/integrations/language-clients/java/jdbc/_snippets/_v0_7.mdx b/docs/integrations/language-clients/java/jdbc/_snippets/_v0_7.mdx index cb7c27b051a..b3a4f81d048 100644 --- a/docs/integrations/language-clients/java/jdbc/_snippets/_v0_7.mdx +++ b/docs/integrations/language-clients/java/jdbc/_snippets/_v0_7.mdx @@ -13,73 +13,62 @@ Latest JDBC (0.7.2) version uses Client-V1 - [OpenJDK](https://openjdk.java.net) version >= 8 - ### Setup {#setup} - - - ```xml - - - com.clickhouse - clickhouse-jdbc - 0.7.2 - - shaded-all - - ``` - - - - - ```kotlin - // https://mvnrepository.com/artifact/com.clickhouse/clickhouse-jdbc - // use uber jar with all dependencies included, change classifier to http for smaller jar - implementation("com.clickhouse:clickhouse-jdbc:0.7.2:shaded-all") - ``` - - - - ```groovy - // https://mvnrepository.com/artifact/com.clickhouse/clickhouse-jdbc - // use uber jar with all dependencies included, change classifier to http for smaller jar - implementation 'com.clickhouse:clickhouse-jdbc:0.7.2:shaded-all' - ``` - - + +```xml + + +com.clickhouse +clickhouse-jdbc +0.7.2 + +shaded-all + +``` + + +```kotlin +// https://mvnrepository.com/artifact/com.clickhouse/clickhouse-jdbc +// use uber jar with all dependencies included, change classifier to http for smaller jar +implementation("com.clickhouse:clickhouse-jdbc:0.7.2:shaded-all") +``` + + +```groovy +// https://mvnrepository.com/artifact/com.clickhouse/clickhouse-jdbc +// use uber jar with all dependencies included, change classifier to http for smaller jar +implementation 'com.clickhouse:clickhouse-jdbc:0.7.2:shaded-all' +``` + Since version `0.5.0`, we are using Apache HTTP Client that's packed the Client. Since there is not a shared version of the package, you need to add a logger as a dependency. - - - ```xml - - - org.slf4j - slf4j-api - 2.0.16 - - ``` - - - - - ```kotlin - // https://mvnrepository.com/artifact/org.slf4j/slf4j-api - implementation("org.slf4j:slf4j-api:2.0.16") - ``` - - - - ```groovy - // https://mvnrepository.com/artifact/org.slf4j/slf4j-api - implementation 'org.slf4j:slf4j-api:2.0.16' - ``` - - + +```xml + + +org.slf4j +slf4j-api +2.0.16 + +``` + + +```kotlin +// https://mvnrepository.com/artifact/org.slf4j/slf4j-api +implementation("org.slf4j:slf4j-api:2.0.16") +``` + + +```groovy +// https://mvnrepository.com/artifact/org.slf4j/slf4j-api +implementation 'org.slf4j:slf4j-api:2.0.16' +``` + ## Configuration {#configuration} @@ -92,20 +81,20 @@ Since version `0.5.0`, we are using Apache HTTP Client that's packed the Client. 
- `jdbc:ch:https://localhost` is same as `jdbc:clickhouse:http://localhost:8443?ssl=true&sslmode=STRICT` - `jdbc:ch:grpc://localhost` is same as `jdbc:clickhouse:grpc://localhost:9100` -**Connection Properties**: + **Connection Properties**: -| Property | Default | Description | -| ------------------------ | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `continueBatchOnError` | `false` | Whether to continue batch processing when error occurred | -| `createDatabaseIfNotExist` | `false` | Whether to create database if it does not exist | -| `custom_http_headers` | | comma separated custom http headers, for example: `User-Agent=client1,X-Gateway-Id=123` | -| `custom_http_params` | | comma separated custom http query parameters, for example: `extremes=0,max_result_rows=100` | -| `nullAsDefault` | `0` | `0` - treat null value as is and throw exception when inserting null into non-nullable column; `1` - treat null value as is and disable null-check for inserting; `2` - replace null to default value of corresponding data type for both query and insert | -| `jdbcCompliance` | `true` | Whether to support standard synchronous UPDATE/DELETE and fake transaction | -| `typeMappings` | | Customize mapping between ClickHouse data type and Java class, which will affect result of both [`getColumnType()`](https://docs.oracle.com/javase/8/docs/api/java/sql/ResultSetMetaData.html#getColumnType-int-) and [`getObject(Class<>?>`)](https://docs.oracle.com/javase/8/docs/api/java/sql/ResultSet.html#getObject-java.lang.String-java.lang.Class-). For example: `UInt128=java.lang.String,UInt256=java.lang.String` | -| `wrapperObject` | `false` | Whether [`getObject()`](https://docs.oracle.com/javase/8/docs/api/java/sql/ResultSet.html#getObject-int-) should return java.sql.Array / java.sql.Struct for Array / Tuple. 
| + | Property | Default | Description | + | ------------------------ | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `continueBatchOnError` | `false` | Whether to continue batch processing when error occurred | + | `createDatabaseIfNotExist` | `false` | Whether to create database if it does not exist | + | `custom_http_headers` | | comma separated custom http headers, for example: `User-Agent=client1,X-Gateway-Id=123` | + | `custom_http_params` | | comma separated custom http query parameters, for example: `extremes=0,max_result_rows=100` | + | `nullAsDefault` | `0` | `0` - treat null value as is and throw exception when inserting null into non-nullable column; `1` - treat null value as is and disable null-check for inserting; `2` - replace null to default value of corresponding data type for both query and insert | + | `jdbcCompliance` | `true` | Whether to support standard synchronous UPDATE/DELETE and fake transaction | + | `typeMappings` | | Customize mapping between ClickHouse data type and Java class, which will affect result of both [`getColumnType()`](https://docs.oracle.com/javase/8/docs/api/java/sql/ResultSetMetaData.html#getColumnType-int-) and [`getObject(Class<>?>`)](https://docs.oracle.com/javase/8/docs/api/java/sql/ResultSet.html#getObject-java.lang.String-java.lang.Class-). For example: `UInt128=java.lang.String,UInt256=java.lang.String` | + | `wrapperObject` | `false` | Whether [`getObject()`](https://docs.oracle.com/javase/8/docs/api/java/sql/ResultSet.html#getObject-int-) should return java.sql.Array / java.sql.Struct for Array / Tuple. | -Note: please refer to [JDBC specific configuration](https://github.com/ClickHouse/clickhouse-java/blob/main/clickhouse-jdbc/src/main/java/com/clickhouse/jdbc/JdbcConfig.java) for more. + Note: please refer to [JDBC specific configuration](https://github.com/ClickHouse/clickhouse-java/blob/main/clickhouse-jdbc/src/main/java/com/clickhouse/jdbc/JdbcConfig.java) for more. ## Supported data types {#supported-data-types} @@ -116,7 +105,7 @@ JDBC Driver supports same data formats as client library does. - Decimal - `SET output_format_decimal_trailing_zeros=1` in 21.9+ for consistency - Enum - can be treated as both string and integer - UInt64 - mapped to `long` (in client-v1) -::: + ::: ## Creating Connection {#creating-connection} @@ -148,19 +137,19 @@ try (Connection conn = dataSource.getConnection(...); :::note - Use `PreparedStatement` instead of `Statement` -::: + ::: -It's easier to use but slower performance compare to input function (see below): + It's easier to use but slower performance compare to input function (see below): -```java showLineNumbers -try (PreparedStatement ps = conn.prepareStatement("insert into mytable(* except (description))")) { + ```java showLineNumbers + try (PreparedStatement ps = conn.prepareStatement("insert into mytable(* except (description))")) { ps.setString(1, "test"); // id ps.setObject(2, LocalDateTime.now()); // timestamp ps.addBatch(); // parameters will be write into buffered stream immediately in binary format ... 
ps.executeBatch(); // stream everything on-hand into ClickHouse -} -``` + } + ``` ### With input table function {#with-input-table-function} @@ -224,21 +213,21 @@ try (ClickHouseConnection conn = newConnection(props); "insert into test_batch_input select id, name, value from input('id Int32, name Nullable(String), desc Nullable(String), value AggregateFunction(groupBitmap, UInt32)')")) { s.execute("drop table if exists test_batch_input;" + "create table test_batch_input(id Int32, name Nullable(String), value AggregateFunction(groupBitmap, UInt32))engine=Memory"); - Object[][] objs = new Object[][] { - new Object[] { 1, "a", "aaaaa", ClickHouseBitmap.wrap(1, 2, 3, 4, 5) }, - new Object[] { 2, "b", null, ClickHouseBitmap.wrap(6, 7, 8, 9, 10) }, - new Object[] { 3, null, "33333", ClickHouseBitmap.wrap(11, 12, 13) } - }; - for (Object[] v : objs) { - stmt.setInt(1, (int) v[0]); - stmt.setString(2, (String) v[1]); - stmt.setString(3, (String) v[2]); - stmt.setObject(4, v[3]); - stmt.addBatch(); - } - int[] results = stmt.executeBatch(); - ... -} + Object[][] objs = new Object[][] { + new Object[] { 1, "a", "aaaaa", ClickHouseBitmap.wrap(1, 2, 3, 4, 5) }, + new Object[] { 2, "b", null, ClickHouseBitmap.wrap(6, 7, 8, 9, 10) }, + new Object[] { 3, null, "33333", ClickHouseBitmap.wrap(11, 12, 13) } + }; + for (Object[] v : objs) { + stmt.setInt(1, (int) v[0]); + stmt.setString(2, (String) v[1]); + stmt.setString(3, (String) v[2]); + stmt.setObject(4, v[3]); + stmt.addBatch(); + } + int[] results = stmt.executeBatch(); + ... + } // use bitmap as query parameter try (PreparedStatement stmt = conn.prepareStatement( @@ -256,8 +245,6 @@ try (PreparedStatement stmt = conn.prepareStatement( } ``` -
    - ## Configuring HTTP library {#configuring-http-library} The ClickHouse JDBC connector supports three HTTP libraries: [`HttpClient`](https://docs.oracle.com/en/java/javase/11/docs/api/java.net.http/java/net/http/HttpClient.html), [`HttpURLConnection`](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/net/HttpURLConnection.html), and [Apache `HttpClient`](https://hc.apache.org/httpcomponents-client-5.2.x/). @@ -356,33 +343,33 @@ On Linux, the equivalent settings alone may not resolve the issue. Additional st 2. After modifying the kernel parameters, apply the changes by running the following command: -```shell -sudo sysctl -p - ``` + ```shell + sudo sysctl -p + ``` -After Setting those settings, you need to ensure that your client enables the Keep Alive option on the socket: + After Setting those settings, you need to ensure that your client enables the Keep Alive option on the socket: -```java -properties.setProperty("socket_keepalive", "true"); -``` + ```java + properties.setProperty("socket_keepalive", "true"); + ``` -:::note -Currently, you must use Apache HTTP Client library when setting the socket keep-alive, as the other two HTTP client libraries supported by `clickhouse-java` do not allow setting socket options. For a detailed guide, see [Configuring HTTP library](/integrations/language-clients/java/jdbc-v1#configuring-http-library). -::: + :::note + Currently, you must use Apache HTTP Client library when setting the socket keep-alive, as the other two HTTP client libraries supported by `clickhouse-java` do not allow setting socket options. For a detailed guide, see [Configuring HTTP library](/integrations/language-clients/java/jdbc-v1#configuring-http-library). + ::: -Alternatively, you can add equivalent parameters to the JDBC URL. + Alternatively, you can add equivalent parameters to the JDBC URL. -The default socket and connection timeout for the JDBC driver is 30 seconds. The timeout can be increased to support large data insert operations. Use the `options` method on `ClickHouseClient` together with the `SOCKET_TIMEOUT` and `CONNECTION_TIMEOUT` options as defined by `ClickHouseClientOption`: + The default socket and connection timeout for the JDBC driver is 30 seconds. The timeout can be increased to support large data insert operations. 
Use the `options` method on `ClickHouseClient` together with the `SOCKET_TIMEOUT` and `CONNECTION_TIMEOUT` options as defined by `ClickHouseClientOption`: -```java showLineNumbers -final int MS_12H = 12 * 60 * 60 * 1000; // 12 h in ms -final String sql = "insert into table_a (c1, c2, c3) select c1, c2, c3 from table_b;"; + ```java showLineNumbers + final int MS_12H = 12 * 60 * 60 * 1000; // 12 h in ms + final String sql = "insert into table_a (c1, c2, c3) select c1, c2, c3 from table_b;"; -try (ClickHouseClient client = ClickHouseClient.newInstance(ClickHouseProtocol.HTTP)) { + try (ClickHouseClient client = ClickHouseClient.newInstance(ClickHouseProtocol.HTTP)) { client.read(servers).write() .option(ClickHouseClientOption.SOCKET_TIMEOUT, MS_12H) .option(ClickHouseClientOption.CONNECTION_TIMEOUT, MS_12H) .query(sql) .executeAndWait(); -} -``` \ No newline at end of file + } + ``` diff --git a/docs/integrations/language-clients/java/jdbc/_snippets/_v0_8.mdx b/docs/integrations/language-clients/java/jdbc/_snippets/_v0_8.mdx index ff85dda35a9..7612a412a8b 100644 --- a/docs/integrations/language-clients/java/jdbc/_snippets/_v0_8.mdx +++ b/docs/integrations/language-clients/java/jdbc/_snippets/_v0_8.mdx @@ -28,34 +28,29 @@ In 0.8 we tried to make the driver more strictly follow the JDBC specification, ### Setup {#setup} - - - ```xml - - - com.clickhouse - clickhouse-jdbc - 0.8.2 - shaded-all - - ``` - - - - - ```kotlin - // https://mvnrepository.com/artifact/com.clickhouse/clickhouse-jdbc - implementation("com.clickhouse:clickhouse-jdbc:0.8.2:shaded-all") - ``` - - - - ```groovy - // https://mvnrepository.com/artifact/com.clickhouse/clickhouse-jdbc - implementation 'com.clickhouse:clickhouse-jdbc:0.8.2:shaded-all' - ``` - - + +```xml + + +com.clickhouse +clickhouse-jdbc +0.8.2 +shaded-all + +``` + + +```kotlin +// https://mvnrepository.com/artifact/com.clickhouse/clickhouse-jdbc +implementation("com.clickhouse:clickhouse-jdbc:0.8.2:shaded-all") +``` + + +```groovy +// https://mvnrepository.com/artifact/com.clickhouse/clickhouse-jdbc +implementation 'com.clickhouse:clickhouse-jdbc:0.8.2:shaded-all' +``` + ## Configuration {#configuration} @@ -67,17 +62,17 @@ In 0.8 we tried to make the driver more strictly follow the JDBC specification, - `jdbc:clickhouse:http://localhost:8123` - `jdbc:clickhouse:https://localhost:8443?ssl=true` -**Connection Properties**: + **Connection Properties**: -Beyond standard JDBC properties, the driver supports the ClickHouse-specific properties offered by the underlying [java client](/integrations/language-clients/java/client/client.mdx). -Where possible methods will return an `SQLFeatureNotSupportedException` if the feature is not supported. Other custom properties include: + Beyond standard JDBC properties, the driver supports the ClickHouse-specific properties offered by the underlying [java client](/integrations/language-clients/java/client/client.mdx). + Where possible methods will return an `SQLFeatureNotSupportedException` if the feature is not supported. 
Other custom properties include: -| Property | Default | Description | -|----------------------------------|---------|----------------------------------------------------------------| -| `disable_frameworks_detection` | `true` | Disable frameworks detection for User-Agent | -| `jdbc_ignore_unsupported_values` | `false` | Suppresses `SQLFeatureNotSupportedException` | -| `clickhouse.jdbc.v1` | `false` | Use older JDBC implementation instead of new JDBC | -| `default_query_settings` | `null` | Allows passing of default query settings with query operations | + | Property | Default | Description | + |----------------------------------|---------|----------------------------------------------------------------| + | `disable_frameworks_detection` | `true` | Disable frameworks detection for User-Agent | + | `jdbc_ignore_unsupported_values` | `false` | Suppresses `SQLFeatureNotSupportedException` | + | `clickhouse.jdbc.v1` | `false` | Use older JDBC implementation instead of new JDBC | + | `default_query_settings` | `null` | Allows passing of default query settings with query operations | ## Supported data types {#supported-data-types} @@ -172,7 +167,6 @@ try (HikariDataSource ds = new HikariDataSource(poolConfig); ## More Information {#more-information} For more information, see our [GitHub repository](https://github.com/ClickHouse/clickhouse-java) and [Java Client documentation](/integrations/language-clients/java/client/client.mdx). - ## Troubleshooting {#troubleshooting} ### Logging {#logging} The driver uses [slf4j](https://www.slf4j.org/) for logging, and will use the first available implementation on the `classpath`. @@ -202,23 +196,23 @@ On Linux, the equivalent settings alone may not resolve the issue. Additional st 1. Adjust the following Linux kernel parameters in `/etc/sysctl.conf` or a related configuration file: - - `net.inet.tcp.keepidle`: 60000 - - `net.inet.tcp.keepintvl`: 45000 - - `net.inet.tcp.keepinit`: 45000 - - `net.inet.tcp.keepcnt`: 8 - - `net.inet.tcp.always_keepalive`: 1 - - `net.ipv4.tcp_keepalive_intvl`: 75 - - `net.ipv4.tcp_keepalive_probes`: 9 - - `net.ipv4.tcp_keepalive_time`: 60 (You may consider lowering this value from the default 300 seconds) + - `net.inet.tcp.keepidle`: 60000 + - `net.inet.tcp.keepintvl`: 45000 + - `net.inet.tcp.keepinit`: 45000 + - `net.inet.tcp.keepcnt`: 8 + - `net.inet.tcp.always_keepalive`: 1 + - `net.ipv4.tcp_keepalive_intvl`: 75 + - `net.ipv4.tcp_keepalive_probes`: 9 + - `net.ipv4.tcp_keepalive_time`: 60 (You may consider lowering this value from the default 300 seconds) 2. 
After modifying the kernel parameters, apply the changes by running the following command: -```shell -sudo sysctl -p -``` + ```shell + sudo sysctl -p + ``` -After Setting those settings, you need to ensure that your client enables the Keep Alive option on the socket: + After Setting those settings, you need to ensure that your client enables the Keep Alive option on the socket: -```java -properties.setProperty("socket_keepalive", "true"); -``` + ```java + properties.setProperty("socket_keepalive", "true"); + ``` diff --git a/docs/integrations/language-clients/js.md b/docs/integrations/language-clients/js.md index 59c641a3e9c..459d2499a6e 100644 --- a/docs/integrations/language-clients/js.md +++ b/docs/integrations/language-clients/js.md @@ -21,9 +21,9 @@ There are two different versions of the client available for different environme - `@clickhouse/client` - Node.js only - `@clickhouse/client-web` - browsers (Chrome/Firefox), Cloudflare workers -When using TypeScript, make sure it is at least [version 4.5](https://www.typescriptlang.org/docs/handbook/release-notes/typescript-4-5.html), which enables [inline import and export syntax](https://www.typescriptlang.org/docs/handbook/release-notes/typescript-4-5.html#type-modifiers-on-import-names). + When using TypeScript, make sure it is at least [version 4.5](https://www.typescriptlang.org/docs/handbook/release-notes/typescript-4-5.html), which enables [inline import and export syntax](https://www.typescriptlang.org/docs/handbook/release-notes/typescript-4-5.html#type-modifiers-on-import-names). -The client source code is available in the [ClickHouse-JS GitHub repository](https://github.com/ClickHouse/clickhouse-js). + The client source code is available in the [ClickHouse-JS GitHub repository](https://github.com/ClickHouse/clickhouse-js). ## Environment requirements (node.js) {#environment-requirements-nodejs} @@ -71,7 +71,7 @@ Likely, the client will work with the older versions, too; however, this is best We aim to cover various scenarios of client usage with the [examples](https://github.com/ClickHouse/clickhouse-js/blob/main/examples) in the client repository. -The overview is available in the [examples README](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/README.md#overview). +The overview is available in the [examples README](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/README.md#overview). If something is unclear or missing from the examples or from the following documentation, feel free to [contact us](./js.md#contact-us). @@ -157,29 +157,29 @@ It is possible to configure most of the client instance parameters with a URL. T | `http_header_*` | see below (3) | | (Node.js only) `keep_alive_idle_socket_ttl` | non-negative number. | -- (1) For booleans, valid values will be `true`/`1` and `false`/`0`. +- (1) For booleans, valid values will be `true`/`1` and `false`/`0`. - (2) Any parameter prefixed with `clickhouse_setting_` or `ch_` will have this prefix removed and the rest added to client's `clickhouse_settings`. For example, `?ch_async_insert=1&ch_wait_for_async_insert=1` will be the same as: -```ts -createClient({ - clickhouse_settings: { + ```ts + createClient({ + clickhouse_settings: { async_insert: 1, wait_for_async_insert: 1, - }, -}) -``` + }, + }) + ``` -Note: boolean values for `clickhouse_settings` should be passed as `1`/`0` in the URL. + Note: boolean values for `clickhouse_settings` should be passed as `1`/`0` in the URL. - (3) Similar to (2), but for `http_header` configuration. 
For example, `?http_header_x-clickhouse-auth=foobar` will be an equivalent of: -```ts -createClient({ - http_headers: { + ```ts + createClient({ + http_headers: { 'x-clickhouse-auth': 'foobar', - }, -}) -``` + }, + }) + ``` ### Connecting {#connecting} @@ -210,7 +210,7 @@ The client repository contains multiple examples that use environment variables, #### Connection pool (Node.js only) {#connection-pool-nodejs-only} -To avoid the overhead of establishing a connection on every request, the client creates a pool of connections to ClickHouse to reuse, utilizing a Keep-Alive mechanism. By default, Keep-Alive is enabled, and the size of connection pool is set to `10`, but you can change it with `max_open_connections` [configuration option](./js.md#configuration). +To avoid the overhead of establishing a connection on every request, the client creates a pool of connections to ClickHouse to reuse, utilizing a Keep-Alive mechanism. By default, Keep-Alive is enabled, and the size of connection pool is set to `10`, but you can change it with `max_open_connections` [configuration option](./js.md#configuration). There is no guarantee the same connection in a pool will be used for subsequent queries unless the user sets `max_open_connections: 1`. This is rarely needed but may be required for cases where users are using temporary tables. @@ -283,9 +283,9 @@ Node.js `ResultSet` implementation uses `Stream.Readable` under the hood, while You can consume the `ResultSet` by calling either `text` or `json` methods on `ResultSet` and load the entire set of rows returned by the query into memory. -You should start consuming the `ResultSet` as soon as possible, as it holds the response stream open and consequently keeps the underlying connection busy. The client does not buffer the incoming data to avoid potential excessive memory usage by the application. +You should start consuming the `ResultSet` as soon as possible, as it holds the response stream open and consequently keeps the underlying connection busy. The client does not buffer the incoming data to avoid potential excessive memory usage by the application. -Alternatively, if it's too large to fit into memory at once, you can call the `stream` method, and process the data in the streaming mode. Each of the response chunks will be transformed into a relatively small arrays of rows instead (the size of this array depends on the size of a particular chunk the client receives from the server, as it may vary, and the size of an individual row), one chunk at a time. +Alternatively, if it's too large to fit into memory at once, you can call the `stream` method, and process the data in the streaming mode. Each of the response chunks will be transformed into a relatively small arrays of rows instead (the size of this array depends on the size of a particular chunk the client receives from the server, as it may vary, and the size of an individual row), one chunk at a time. Please refer to the list of the [supported data formats](./js.md#supported-data-formats) to determine what the best format is for streaming in your case. For example, if you want to stream JSON objects, you could choose [JSONEachRow](/sql-reference/formats#jsoneachrow), and each row will be parsed as a JS object, or, perhaps, a more compact [JSONCompactColumns](/sql-reference/formats#jsoncompactcolumns) format that will result in each row being a compact array of values. See also: [streaming files](./js.md#streaming-files-nodejs-only). 
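To make the two consumption modes concrete, here is a minimal Node.js sketch (an illustration only: it assumes a reachable local ClickHouse with default credentials and uses `system.numbers` as a stand-in for a real table) showing a fully buffered read via `json()` next to an incremental read via `stream()`:

```ts
import { createClient } from '@clickhouse/client' // Node.js variant

const client = createClient() // assumes a local ClickHouse with default credentials

// Small result set: buffer the entire response in memory with `json()`.
const small = await client.query({
  query: 'SELECT number FROM system.numbers LIMIT 10',
  format: 'JSONEachRow',
})
console.log(await small.json())

// Larger result set: consume the response incrementally with `stream()`.
// Each emitted chunk is an array of rows; parse rows lazily inside the loop.
const large = await client.query({
  query: 'SELECT number FROM system.numbers LIMIT 1000000',
  format: 'JSONEachRow',
})
for await (const rows of large.stream()) {
  for (const row of rows) {
    row.json() // keep per-row processing lightweight; move heavy work outside the loop
  }
}

await client.close()
```

Treat this sketch as an outline of the two modes rather than a complete program; the streaming examples in the client repository cover the format-specific details.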
@@ -323,7 +323,7 @@ interface Row { } ``` -**Example:** (Node.js/Web) A query with a resulting dataset in `JSONEachRow` format, consuming the entire stream and parsing the contents as JS objects. +**Example:** (Node.js/Web) A query with a resulting dataset in `JSONEachRow` format, consuming the entire stream and parsing the contents as JS objects. [Source code](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/array_json_each_row.ts). ```ts @@ -445,7 +445,7 @@ It can work with either a `Stream.Readable` or a plain `Array`, depending on Insert method is supposed to be awaited; however, it is possible to specify an input stream and await the `insert` operation later, only when the stream is completed (which will also resolve the `insert` promise). This could potentially be useful for event listeners and similar scenarios, but the error handling might be non-trivial with a lot of edge cases on the client side. Instead, consider using [async inserts](/optimize/asynchronous-inserts) as illustrated in [this example](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/async_insert_without_waiting.ts). :::tip -If you have a custom INSERT statement that is difficult to model with this method, consider using the [command method](./js.md#command-method). +If you have a custom INSERT statement that is difficult to model with this method, consider using the [command method](./js.md#command-method). You can see how it is used in the [INSERT INTO ... VALUES](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/insert_values_and_functions.ts) or [INSERT INTO ... SELECT](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/insert_from_select.ts) examples. ::: @@ -473,7 +473,7 @@ See also: [Base parameters for all client methods](./js.md#base-parameters-for-a A request canceled with `abort_signal` does not guarantee that data insertion did not take place, as the server could have received some of the streamed data before the cancellation. ::: -**Example:** (Node.js/Web) Insert an array of values. +**Example:** (Node.js/Web) Insert an array of values. [Source code](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/array_json_each_row.ts). ```ts @@ -555,7 +555,7 @@ await client.insert({ Currently, inserts in `@clickhouse/client-web` only work with `Array` and `JSON*` formats. Inserting streams is not supported in the web version yet due to poor browser compatibility. -Consequently, the `InsertParams` interface for the web version looks slightly different from the Node.js version, +Consequently, the `InsertParams` interface for the web version looks slightly different from the Node.js version, as `values` are limited to the `ReadonlyArray` type only: ```ts @@ -602,7 +602,7 @@ interface ClickHouseClient { See also: [Base parameters for all client methods](./js.md#base-parameters-for-all-client-methods). -**Example:** (Node.js/Web) Create a table in ClickHouse Cloud. +**Example:** (Node.js/Web) Create a table in ClickHouse Cloud. [Source code](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/create_table_cloud.ts). ```ts @@ -612,7 +612,7 @@ await client.command({ (id UInt64, name String) ORDER BY (id) `, - // Recommended for cluster usage to avoid situations where a query processing error occurred after the response code, + // Recommended for cluster usage to avoid situations where a query processing error occurred after the response code, // and HTTP headers were already sent to the client. 
// See https://clickhouse.com/docs/interfaces/http/#response-buffering clickhouse_settings: { @@ -621,7 +621,7 @@ await client.command({ }) ``` -**Example:** (Node.js/Web) Create a table in a self-hosted ClickHouse instance. +**Example:** (Node.js/Web) Create a table in a self-hosted ClickHouse instance. [Source code](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/create_table_single_node.ts). ```ts @@ -689,7 +689,7 @@ export interface QueryResult { ### Ping {#ping} -The `ping` method provided to check the connectivity status returns `true` if the server can be reached. +The `ping` method provided to check the connectivity status returns `true` if the server can be reached. If the server is unreachable, the underlying error is included in the result as well. @@ -715,7 +715,7 @@ if (!result.success) { } ``` -NB: due to `/ping` endpoint not implementing CORS, the web version uses a simple `SELECT 1` to achieve a similar result. +NB: due to `/ping` endpoint not implementing CORS, the web version uses a simple `SELECT 1` to achieve a similar result. ### Close (Node.js only) {#close-nodejs-only} @@ -734,8 +734,8 @@ There are several file streaming examples with popular data formats (NDJSON, CSV - [Streaming from a Parquet file](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/node/insert_file_stream_parquet.ts) - [Streaming into a Parquet file](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/node/select_parquet_as_file.ts) -Streaming other formats into a file should be similar to Parquet, -the only difference will be in the format used for `query` call (`JSONEachRow`, `CSV`, etc.) and the output file name. + Streaming other formats into a file should be similar to Parquet, + the only difference will be in the format used for `query` call (`JSONEachRow`, `CSV`, etc.) and the output file name. ## Supported data formats {#supported-data-formats} @@ -746,9 +746,9 @@ If you specify `format` as one of the JSON-family (`JSONEachRow`, `JSONCompactEa Data provided in the "raw" text formats (`CSV`, `TabSeparated` and `CustomSeparated` families) are sent over the wire without additional transformations. :::tip -There might be confusion between JSON as a general format and [ClickHouse JSON format](/sql-reference/formats#json). +There might be confusion between JSON as a general format and [ClickHouse JSON format](/sql-reference/formats#json). -The client supports streaming JSON objects with formats such as [JSONEachRow](/sql-reference/formats#jsoneachrow) (see the table overview for other streaming-friendly formats; see also the `select_streaming_` [examples in the client repository](https://github.com/ClickHouse/clickhouse-js/tree/main/examples/node)). +The client supports streaming JSON objects with formats such as [JSONEachRow](/sql-reference/formats#jsoneachrow) (see the table overview for other streaming-friendly formats; see also the `select_streaming_` [examples in the client repository](https://github.com/ClickHouse/clickhouse-js/tree/main/examples/node)). It's only that formats like [ClickHouse JSON](/sql-reference/formats#json) and a few others are represented as a single object in the response and cannot be streamed by the client. ::: @@ -786,7 +786,7 @@ For Parquet, the main use case for selects likely will be writing the resulting `JSONEachRowWithProgress` is an output-only format that supports progress reporting in the stream. 
See [this example](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/node/select_json_each_row_with_progress.ts) for more details. -The entire list of ClickHouse input and output formats is available +The entire list of ClickHouse input and output formats is available [here](/interfaces/formats). ## Supported ClickHouse data types {#supported-clickhouse-data-types} @@ -827,7 +827,7 @@ The related JS type is relevant for any `JSON*` formats except the ones that rep | MultiPolygon | ✔️ | Array<Polygon\> | | Map(K, V) | ✔️ | Record<K, V\> | -The entire list of supported ClickHouse formats is available +The entire list of supported ClickHouse formats is available [here](/sql-reference/data-types/). ### Date/Date32 types caveats {#datedate32-types-caveats} @@ -835,7 +835,7 @@ The entire list of supported ClickHouse formats is available Since the client inserts values without additional type conversion, `Date`/`Date32` type columns can only be inserted as strings. -**Example:** Insert a `Date` type value. +**Example:** Insert a `Date` type value. [Source code](https://github.com/ClickHouse/clickhouse-js/blob/ba387d7f4ce375a60982ac2d99cb47391cf76cec/__tests__/integration/date_time.test.ts) . @@ -950,7 +950,7 @@ client.query({ }) ``` -A type declaration file with all the supported ClickHouse settings can be found +A type declaration file with all the supported ClickHouse settings can be found [here](https://github.com/ClickHouse/clickhouse-js/blob/main/packages/client-common/src/settings.ts). :::important @@ -976,22 +976,22 @@ where: - `name` — Placeholder identifier. - `data_type` - [Data type](/sql-reference/data-types/) of the app parameter value. -**Example:**: Query with parameters. -[Source code](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/query_with_parameter_binding.ts) -. + **Example:**: Query with parameters. + [Source code](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/query_with_parameter_binding.ts) + . -```ts -await client.query({ - query: 'SELECT plus({val1: Int32}, {val2: Int32})', - format: 'CSV', - query_params: { + ```ts + await client.query({ + query: 'SELECT plus({val1: Int32}, {val2: Int32})', + format: 'CSV', + query_params: { val1: 10, val2: 20, - }, -}) -``` + }, + }) + ``` -Check https://clickhouse.com/docs/interfaces/cli#cli-queries-with-parameters-syntax for additional details. + Check https://clickhouse.com/docs/interfaces/cli#cli-queries-with-parameters-syntax for additional details. ### Compression {#compression} @@ -1068,7 +1068,7 @@ Currently, the client will log the following events: - `WARN` - non-fatal errors; failed `ping` request is logged as a warning, as the underlying error is included in the returned result - `ERROR` - fatal errors from `query`/`insert`/`exec`/`command` methods, such as a failed request -You can find the default Logger implementation [here](https://github.com/ClickHouse/clickhouse-js/blob/main/packages/client-common/src/logger.ts). + You can find the default Logger implementation [here](https://github.com/ClickHouse/clickhouse-js/blob/main/packages/client-common/src/logger.ts). ### TLS certificates (Node.js only) {#tls-certificates-nodejs-only} @@ -1140,12 +1140,12 @@ In this case, `keep_alive_timeout` is 10 seconds, and you could try increasing ` If you are experiencing `socket hang up` errors while using Keep-Alive, there are the following options to resolve this issue: -* Slightly reduce `keep_alive.idle_socket_ttl` setting in the ClickHouse server configuration. 
In certain situations, for example, high network latency between client and server, it could be beneficial to reduce `keep_alive.idle_socket_ttl` by another 200-500 milliseconds, ruling out the situation where an outgoing request could obtain a socket that the server is going to close. +* Slightly reduce `keep_alive.idle_socket_ttl` setting in the ClickHouse server configuration. In certain situations, for example, high network latency between client and server, it could be beneficial to reduce `keep_alive.idle_socket_ttl` by another 200-500 milliseconds, ruling out the situation where an outgoing request could obtain a socket that the server is going to close. * If this error is happening during long-running queries with no data coming in or out (for example, a long-running `INSERT FROM SELECT`), this might be due to the load balancer closing idling connections. You could try forcing some data coming in during long-running queries by using a combination of these ClickHouse settings: - ```ts - const client = createClient({ + ```ts + const client = createClient({ // Here we assume that we will have some queries with more than 5 minutes of execution time request_timeout: 400_000, /** These settings in combination allow to avoid LB timeout issues in case of long-running queries without data coming in or out, @@ -1155,21 +1155,21 @@ If you are experiencing `socket hang up` errors while using Keep-Alive, there ar send_progress_in_http_headers: 1, http_headers_progress_interval_ms: '110000', // UInt64, should be passed as a string }, - }) - ``` - Keep in mind, however, that the total size of the received headers has 16KB limit in recent Node.js versions; after certain amount of progress headers received, which was around 70-80 in our tests, an exception will be generated. + }) + ``` + Keep in mind, however, that the total size of the received headers has 16KB limit in recent Node.js versions; after certain amount of progress headers received, which was around 70-80 in our tests, an exception will be generated. - It is also possible to use an entirely different approach, avoiding wait time on the wire completely; it could be done by leveraging HTTP interface "feature" that mutations are not cancelled when the connection is lost. See [this example (part 2)](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/long_running_queries_timeouts.ts) for more details. + It is also possible to use an entirely different approach, avoiding wait time on the wire completely; it could be done by leveraging HTTP interface "feature" that mutations are not cancelled when the connection is lost. See [this example (part 2)](https://github.com/ClickHouse/clickhouse-js/blob/main/examples/long_running_queries_timeouts.ts) for more details. * Keep-Alive feature can be disabled entirely. In this case, client will also add `Connection: close` header to every request, and the underlying HTTP agent will not reuse the connections. `keep_alive.idle_socket_ttl` setting will be ignored, as there will be no idling sockets. This will result in additional overhead, as a new connection will be established for every request. 
- ```ts - const client = createClient({ + ```ts + const client = createClient({ keep_alive: { enabled: false, }, - }) - ``` + }) + ``` ### Read-only users {#read-only-users} diff --git a/docs/integrations/language-clients/python/index.md b/docs/integrations/language-clients/python/index.md index 3e88ff1dce6..b195bfb140a 100644 --- a/docs/integrations/language-clients/python/index.md +++ b/docs/integrations/language-clients/python/index.md @@ -15,32 +15,31 @@ import ConnectionDetails from '@site/docs/_snippets/_gather_your_details_http.md ClickHouse Connect is a core database driver providing interoperability with a wide range of Python applications. -- The main interface is the `Client` object in the package `clickhouse_connect.driver`. That core package -also includes assorted helper classes and utility functions used for communicating with the ClickHouse server and -"context" implementations for advanced management of insert and select queries. +- The main interface is the `Client` object in the package `clickhouse_connect.driver`. That core package + also includes assorted helper classes and utility functions used for communicating with the ClickHouse server and + "context" implementations for advanced management of insert and select queries. - The `clickhouse_connect.datatypes` package provides a base implementation and subclasses for all non-experimental -ClickHouse datatypes. Its primary functionality is serialization and deserialization of ClickHouse data into the -ClickHouse "Native" binary columnar format, used to achieve the most efficient transport between ClickHouse and client -applications. + ClickHouse datatypes. Its primary functionality is serialization and deserialization of ClickHouse data into the + ClickHouse "Native" binary columnar format, used to achieve the most efficient transport between ClickHouse and client + applications. - The Cython/C classes in the `clickhouse_connect.cdriver` package optimize some of the most common serializations and -deserializations for significantly improved performance over pure Python. + deserializations for significantly improved performance over pure Python. - There is a limited [SQLAlchemy](https://www.sqlalchemy.org/) dialect in the package `clickhouse_connect.cc_sqlalchemy` which is built off of -the `datatypes` and `dbi` packages. This restricted implementation focuses on query/cursor functionality, and does not -generally support SQLAlchemy DDL and ORM operations. (SQLAlchemy is targeted toward OLTP databases, and we recommend -more specialized tools and frameworks to manage the ClickHouse OLAP oriented database.) + the `datatypes` and `dbi` packages. This restricted implementation focuses on query/cursor functionality, and does not + generally support SQLAlchemy DDL and ORM operations. (SQLAlchemy is targeted toward OLTP databases, and we recommend + more specialized tools and frameworks to manage the ClickHouse OLAP oriented database.) - The core driver and ClickHouse Connect SQLAlchemy implementation are the preferred method for connecting ClickHouse -to Apache Superset. Use the `ClickHouse Connect` database connection, or `clickhousedb` SQLAlchemy dialect connection -string. + to Apache Superset. Use the `ClickHouse Connect` database connection, or `clickhousedb` SQLAlchemy dialect connection + string. + This documentation is current as of the beta release 0.8.2. -This documentation is current as of the beta release 0.8.2. 
- -:::note -The official ClickHouse Connect Python driver uses HTTP protocol for communication with the ClickHouse server. -It has some advantages (like better flexibility, HTTP-balancers support, better compatibility with JDBC-based tools, etc) -and disadvantages (like slightly lower compression and performance, and a lack of support for some complex features of the native TCP-based protocol). -For some use cases, you may consider using one of the [Community Python drivers](/interfaces/third-party/client-libraries.md) that uses native TCP-based protocol. -::: + :::note + The official ClickHouse Connect Python driver uses HTTP protocol for communication with the ClickHouse server. + It has some advantages (like better flexibility, HTTP-balancers support, better compatibility with JDBC-based tools, etc) + and disadvantages (like slightly lower compression and performance, and a lack of support for some complex features of the native TCP-based protocol). + For some use cases, you may consider using one of the [Community Python drivers](/interfaces/third-party/client-libraries.md) that uses native TCP-based protocol. + ::: ### Requirements and compatibility {#requirements-and-compatibility} @@ -53,7 +52,6 @@ For some use cases, you may consider using one of the [Community Python drivers] | 3.11.x | ✅ | Windows | ✅ | 24.8.x | ✅ | | | 2.1.x | ✅ | | 3.12.x | ✅ | | | 24.9.x | ✅ | | | 3.0.x | ✅ | - ¹ClickHouse Connect has been explicitly tested against the listed platforms. In addition, untested binary wheels (with C optimization) are built for all architectures supported by the excellent [`cibuildwheel``](https://cibuildwheel.readthedocs.io/en/stable/) project. Finally, because ClickHouse Connect can also run as pure Python, the source installation should work on any recent @@ -65,7 +63,6 @@ Python installation. protocol, it should also work correctly for most other versions of ClickHouse, although there may be some incompatibilities with certain advanced data types. - ### Installation {#installation} Install ClickHouse Connect from PyPI via pip: @@ -98,7 +95,6 @@ There are two examples shown for connecting to ClickHouse: ##### Use a ClickHouse Connect client instance to connect to a ClickHouse server on localhost: {#use-a-clickhouse-connect-client-instance-to-connect-to-a-clickhouse-server-on-localhost} - ```python import clickhouse_connect @@ -111,7 +107,6 @@ client = clickhouse_connect.get_client(host='localhost', username='default', pas Use the connection details gathered earlier. ClickHouse Cloud services require TLS, so use port 8443. ::: - ```python import clickhouse_connect @@ -143,7 +138,6 @@ result.result_rows Out[13]: [(2000, -50.9035)] ``` - ## ClickHouse Connect driver API {#clickhouse-connect-driver-api} ***Note:*** Passing keyword arguments is recommended for most api methods given the number of @@ -215,32 +209,32 @@ see [the ClickHouse documentation](/operations/settings/settings.md). 
#### Client creation examples {#client-creation-examples} - Without any parameters, a ClickHouse Connect client will connect to the default HTTP port on `localhost` with the - default user and no password: + default user and no password: -```python -import clickhouse_connect + ```python + import clickhouse_connect -client = clickhouse_connect.get_client() -client.server_version -Out[2]: '22.10.1.98' -``` + client = clickhouse_connect.get_client() + client.server_version + Out[2]: '22.10.1.98' + ``` - Connecting to a secure (https) external ClickHouse server -```python -import clickhouse_connect + ```python + import clickhouse_connect -client = clickhouse_connect.get_client(host='play.clickhouse.com', secure=True, port=443, user='play', password='clickhouse') -client.command('SELECT timezone()') -Out[2]: 'Etc/UTC' -``` + client = clickhouse_connect.get_client(host='play.clickhouse.com', secure=True, port=443, user='play', password='clickhouse') + client.command('SELECT timezone()') + Out[2]: 'Etc/UTC' + ``` - Connecting with a session id and other custom connection parameters and ClickHouse settings. -```python -import clickhouse_connect + ```python + import clickhouse_connect -client = clickhouse_connect.get_client(host='play.clickhouse.com', + client = clickhouse_connect.get_client(host='play.clickhouse.com', user='play', password='clickhouse', port=443, @@ -248,9 +242,9 @@ client = clickhouse_connect.get_client(host='play.clickhouse.com', connect_timeout=15, database='github', settings={'distributed_ddl_task_timeout':300}) -client.database -Out[2]: 'github' -``` + client.database + Out[2]: 'github' + ``` ### Common method arguments {#common-method-arguments} @@ -266,22 +260,22 @@ binding Python expressions to a ClickHouse value expression. Two sorts of bindin ClickHouse supports [server side binding](/interfaces/cli.md#cli-queries-with-parameters) for most query values, where the bound value is sent separate from the query as an HTTP query parameter. ClickHouse -Connect will add the appropriate query parameters if it detects a binding expression of the form +Connect will add the appropriate query parameters if it detects a binding expression of the form `{<name>:<datatype>}`. For server side binding, the `parameters` argument should be a Python dictionary. - Server Side Binding with Python Dictionary, DateTime value and string value -```python -import datetime + ```python + import datetime -my_date = datetime.datetime(2022, 10, 1, 15, 20, 5) + my_date = datetime.datetime(2022, 10, 1, 15, 20, 5) -parameters = {'table': 'my_table', 'v1': my_date, 'v2': "a string with a single quote'"} -client.query('SELECT * FROM {table:Identifier} WHERE date >= {v1:DateTime} AND string ILIKE {v2:String}', parameters=parameters) + parameters = {'table': 'my_table', 'v1': my_date, 'v2': "a string with a single quote'"} + client.query('SELECT * FROM {table:Identifier} WHERE date >= {v1:DateTime} AND string ILIKE {v2:String}', parameters=parameters) -# Generates the following query on the server -# SELECT * FROM my_table WHERE date >= '2022-10-01 15:20:05' AND string ILIKE 'a string with a single quote\'' -``` + # Generates the following query on the server + # SELECT * FROM my_table WHERE date >= '2022-10-01 15:20:05' AND string ILIKE 'a string with a single quote\'' + ``` **IMPORTANT** -- Server side binding is only supported (by the ClickHouse server) for `SELECT` queries. It does not work for `ALTER`, `DELETE`, `INSERT`, or other types of queries. 
This may change in the future, see https://github.com/ClickHouse/ClickHouse/issues/42092. @@ -300,12 +294,12 @@ need to be formatted differently (backticks or double quotes for database identi - Example with Python Dictionary, DateTime value and string escaping ```python -import datetime + import datetime -my_date = datetime.datetime(2022, 10, 1, 15, 20, 5) + my_date = datetime.datetime(2022, 10, 1, 15, 20, 5) -parameters = {'v1': my_date, 'v2': "a string with a single quote'"} -client.query('SELECT * FROM some_table WHERE date >= %(v1)s AND string ILIKE %(v2)s', parameters=parameters) + parameters = {'v1': my_date, 'v2': "a string with a single quote'"} + client.query('SELECT * FROM some_table WHERE date >= %(v1)s AND string ILIKE %(v2)s', parameters=parameters) # Generates the following query: # SELECT * FROM some_table WHERE date >= '2022-10-01 15:20:05' AND string ILIKE 'a string with a single quote\'' @@ -313,33 +307,33 @@ client.query('SELECT * FROM some_table WHERE date >= %(v1)s AND string ILIKE %(v - Example with Python Sequence (Tuple), Float64, and IPv4Address -```python -import ipaddress + ```python + import ipaddress -parameters = (35200.44, ipaddress.IPv4Address(0x443d04fe)) -client.query('SELECT * FROM some_table WHERE metric >= %s AND ip_address = %s', parameters=parameters) + parameters = (35200.44, ipaddress.IPv4Address(0x443d04fe)) + client.query('SELECT * FROM some_table WHERE metric >= %s AND ip_address = %s', parameters=parameters) -# Generates the following query: -# SELECT * FROM some_table WHERE metric >= 35200.44 AND ip_address = '68.61.4.254'' -``` + # Generates the following query: + # SELECT * FROM some_table WHERE metric >= 35200.44 AND ip_address = '68.61.4.254'' + ``` :::note To bind DateTime64 arguments (ClickHouse types with sub-second precision) requires one of two custom approaches: - Wrap the Python `datetime.datetime` value in the new DT64Param class, e.g. - ```python + ```python query = 'SELECT {p1:DateTime64(3)}' # Server side binding with dictionary parameters={'p1': DT64Param(dt_value)} - - query = 'SELECT %s as string, toDateTime64(%s,6) as dateTime' # Client side binding with list + + query = 'SELECT %s as string, toDateTime64(%s,6) as dateTime' # Client side binding with list parameters=['a string', DT64Param(datetime.now())] - ``` - - If using a dictionary of parameter values, append the string `_64` to the parameter name - ```python + ``` + - If using a dictionary of parameter values, append the string `_64` to the parameter name + ```python query = 'SELECT {p1:DateTime64(3)}, {a1:Array(DateTime(3))}' # Server side binding with dictionary - + parameters={'p1_64': dt_value, 'a1_64': [dt_value1, dt_value2]} - ``` -::: + ``` + ::: #### Settings argument {#settings-argument-1} @@ -376,22 +370,22 @@ a single primitive or array value rather than a full dataset. This method takes | external_data | ExternalData | *None* | An ExternalData object containing file or binary data to use with the query. See [Advanced Queries (External Data)](#external-data) | - _command_ can be used for DDL statements. If the SQL "command" does not return data, a "query summary" -dictionary is returned instead. This dictionary encapsulates the ClickHouse X-ClickHouse-Summary and -X-ClickHouse-Query-Id headers, including the key/value pairs `written_rows`,`written_bytes`, and `query_id`. + dictionary is returned instead. 
This dictionary encapsulates the ClickHouse X-ClickHouse-Summary and + X-ClickHouse-Query-Id headers, including the key/value pairs `written_rows`,`written_bytes`, and `query_id`. -```python -client.command('CREATE TABLE test_command (col_1 String, col_2 DateTime) Engine MergeTree ORDER BY tuple()') -client.command('SHOW CREATE TABLE test_command') -Out[6]: 'CREATE TABLE default.test_command\\n(\\n `col_1` String,\\n `col_2` DateTime\\n)\\nENGINE = MergeTree\\nORDER BY tuple()\\nSETTINGS index_granularity = 8192' -``` + ```python + client.command('CREATE TABLE test_command (col_1 String, col_2 DateTime) Engine MergeTree ORDER BY tuple()') + client.command('SHOW CREATE TABLE test_command') + Out[6]: 'CREATE TABLE default.test_command\\n(\\n `col_1` String,\\n `col_2` DateTime\\n)\\nENGINE = MergeTree\\nORDER BY tuple()\\nSETTINGS index_granularity = 8192' + ``` - _command_ can also be used for simple queries that return only a single row -```python -result = client.command('SELECT count() FROM system.tables') -result -Out[7]: 110 -``` + ```python + result = client.command('SELECT count() FROM system.tables') + result + Out[7]: 110 + ``` ### Client _query_ Method {#client-_query_-method} @@ -420,30 +414,30 @@ efficiently. This method takes the following parameters. The base `query` method returns a QueryResult object with the following public properties: - `result_rows` -- A matrix of the data returned in the form of a Sequence of rows, with each row element being a -sequence of column values. + sequence of column values. - `result_columns` -- A matrix of the data returned in the form of a Sequence of columns, with each column element being -a sequence of the row values for that column + a sequence of the row values for that column - `column_names` -- A tuple of strings representing the column names in the `result_set` - `column_types` -- A tuple of ClickHouseType instances representing the ClickHouse data type for each column in -the `result_columns` + the `result_columns` - `query_id` -- The ClickHouse query_id (useful for examining the query in the `system.query_log` table) - `summary` -- Any data returned by the `X-ClickHouse-Summary` HTTP response header - `first_item` -- A convenience property for retrieving the first row of the response as a dictionary (keys are column -names) + names) - `first_row` -- A convenience property to return the first row of the result - `column_block_stream` -- A generator of query results in column oriented format. This property should not be - referenced directly (see below). + referenced directly (see below). - `row_block_stream` -- A generator of query results in row oriented format. This property should not be referenced - directly (see below). + directly (see below). - `rows_stream` -- A generator of query results that yields a single row per invocation. This property should not be - referenced directly (see below). + referenced directly (see below). - `summary` -- As described under the `command` method, a dictionary of summary information returned by ClickHouse -The `*_stream` properties return a Python Context that can be used as an iterator for the returned data. They should -only be accessed indirectly using the Client `*_stream` methods. + The `*_stream` properties return a Python Context that can be used as an iterator for the returned data. They should + only be accessed indirectly using the Client `*_stream` methods. 
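As a brief illustration of the non-streaming properties above, the following sketch (assuming a `client` created with `clickhouse_connect.get_client()` as shown earlier) runs a small query against the built-in `system.tables` table and inspects the returned `QueryResult`:

```python
import clickhouse_connect

client = clickhouse_connect.get_client()  # connection parameters as described above

result = client.query('SELECT name, engine FROM system.tables LIMIT 3')

print(result.column_names)     # tuple of column names, e.g. ('name', 'engine')
print(result.column_types)     # tuple of ClickHouseType instances, one per column
print(result.first_row)        # the first row as a sequence of column values
for row in result.result_rows:
    print(row)                 # each row is a sequence of column values
print(result.query_id)         # useful for finding the query in system.query_log
print(result.summary)          # key/value pairs from the X-ClickHouse-Summary header
```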
-The complete details of streaming query results (using StreamContext objects) are outlined in -[Advanced Queries (Streaming Queries)](#streaming-queries). + The complete details of streaming query results (using StreamContext objects) are outlined in + [Advanced Queries (Streaming Queries)](#streaming-queries). ### Consuming query results with NumPy, Pandas or Arrow {#consuming-query-results-with-numpy-pandas-or-arrow} @@ -452,9 +446,9 @@ There are three specialized versions of the main `query` method: - `query_np` -- This version returns a NumPy Array instead a ClickHouse Connect QueryResult. - `query_df` -- This version returns a Pandas Dataframe instead of a ClickHouse Connect QueryResult. - `query_arrow` -- This version returns a PyArrow Table. It utilizes the ClickHouse `Arrow` format directly, so - it only accepts three arguments in common with the main `query method`: `query`, `parameters`, and `settings`. In - addition, there is additional argument `use_strings` which determines whether the Arrow Table will render ClickHouse - String types as strings (if True) or bytes (if False). + it only accepts three arguments in common with the main `query method`: `query`, `parameters`, and `settings`. In + addition, there is additional argument `use_strings` which determines whether the Arrow Table will render ClickHouse + String types as strings (if True) or bytes (if False). ### Client streaming query methods {#client-streaming-query-methods} @@ -468,8 +462,8 @@ generator): - `query_df_stream` -- Returns each ClickHouse Block of query data as a Pandas Dataframe - `query_arrow_stream` -- Returns query data in PyArrow RecordBlocks -Each of these methods returns a `ContextStream` object that must be opened via a `with` statement to start consuming the -stream. See [Advanced Queries (Streaming Queries)](#streaming-queries) for details and examples. + Each of these methods returns a `ContextStream` object that must be opened via a `with` statement to start consuming the + stream. See [Advanced Queries (Streaming Queries)](#streaming-queries) for details and examples. ### Client _insert_ Method {#client-_insert_-method} @@ -493,14 +487,14 @@ This method returns a "query summary" dictionary as described under the "command There are two specialized versions of the main `insert` method: - `insert_df` -- Instead of Python Sequence of Sequences `data` argument, the second parameter of this method requires -a `df`argument that must be a Pandas Dataframe instance. ClickHouse Connect automatically processes the Dataframe as a -column oriented datasource, so the `column_oriented` parameter is not required or available. + a `df`argument that must be a Pandas Dataframe instance. ClickHouse Connect automatically processes the Dataframe as a + column oriented datasource, so the `column_oriented` parameter is not required or available. - `insert_arrow` -- Instead of a Python Sequence of Sequences `data` argument, this method requires an `arrow_table`. -ClickHouse Connect passes the Arrow table unmodified to the ClickHouse server for processing, so only the `database` -and `settings` arguments are available in addition to `table` and `arrow_table`. + ClickHouse Connect passes the Arrow table unmodified to the ClickHouse server for processing, so only the `database` + and `settings` arguments are available in addition to `table` and `arrow_table`. -*Note:* A NumPy array is a valid Sequence of Sequences and can be used as the `data` argument to the main `insert` -method, so a specialized method is not required. 
+ *Note:* A NumPy array is a valid Sequence of Sequences and can be used as the `data` argument to the main `insert` + method, so a specialized method is not required. ### File Inserts {#file-inserts} @@ -640,10 +634,10 @@ the second is the HTTP connection pool used by ClickHouse Connect Client instanc ### Asyncclient wrapper {#asyncclient-wrapper} -Since 0.7.16, ClickHouse Connect provides an async wrapper over the regular `Client`, +Since 0.7.16, ClickHouse Connect provides an async wrapper over the regular `Client`, so that it is possible to use the client in an `asyncio` environment. -To get an instance of the `AsyncClient`, you could use the `get_async_client` factory function, +To get an instance of the `AsyncClient`, you could use the `get_async_client` factory function, which accepts the same parameters as the standard `get_client`: ```python @@ -651,22 +645,20 @@ import asyncio import clickhouse_connect - async def main(): client = await clickhouse_connect.get_async_client() result = await client.query("SELECT name FROM system.databases LIMIT 1") print(result.result_rows) - asyncio.run(main()) ``` -`AsyncClient` has the same methods with the same parameters as the standard `Client`, but they are coroutines when -applicable. Internally, these methods from the `Client` that perform I/O operations are wrapped in a -[run_in_executor](https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor) call. +`AsyncClient` has the same methods with the same parameters as the standard `Client`, but they are coroutines when +applicable. Internally, these methods from the `Client` that perform I/O operations are wrapped in a +[run_in_executor](https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor) call. -Multithreaded performance will increase when using the `AsyncClient` wrapper, -as the execution threads and the GIL will be released while waiting for I/O operations to complete. +Multithreaded performance will increase when using the `AsyncClient` wrapper, +as the execution threads and the GIL will be released while waiting for I/O operations to complete. Note: unlike the regular `Client`, the `AsyncClient` enforces the `autogenerate_session_id` to be `False` by default. @@ -677,32 +669,32 @@ See also: [run_async example](https://github.com/ClickHouse/clickhouse-connect/b Each ClickHouse query occurs within the context of a ClickHouse "session". Sessions are currently used for two purposes: - To associate specific ClickHouse settings with multiple queries (see the -[user settings](/operations/settings/settings.md)). The ClickHouse `SET` command is used to change the -settings for the scope of a user session. + [user settings](/operations/settings/settings.md)). The ClickHouse `SET` command is used to change the + settings for the scope of a user session. - To track [temporary tables.](/sql-reference/statements/create/table#temporary-tables) -By default, each query executed with a ClickHouse Connect Client instance uses the same session id to enable this -session functionality. That is, `SET` statements and temporary table work as expected when using a single ClickHouse -client. However, by design the ClickHouse server does not allow concurrent queries within the same session. -As a result, there are two options for a ClickHouse Connect application that will execute concurrent queries. + By default, each query executed with a ClickHouse Connect Client instance uses the same session id to enable this + session functionality. 
That is, `SET` statements and temporary table work as expected when using a single ClickHouse + client. However, by design the ClickHouse server does not allow concurrent queries within the same session. + As a result, there are two options for a ClickHouse Connect application that will execute concurrent queries. - Create a separate `Client` instance for each thread of execution (thread, process, or event handler) that will -have its own session id. This is generally the best approach, as it preserves the session state for each client. + have its own session id. This is generally the best approach, as it preserves the session state for each client. - Use a unique session id for each query. This avoids the concurrent session problem in circumstances where -temporary tables or shared session settings are not required. (Shared settings can also be provided -when creating the client, but these are sent with each request and not associated with a session). The unique -session_id can be added to the `settings` dictionary for each request, or you can disable the -`autogenerate_session_id` common setting: + temporary tables or shared session settings are not required. (Shared settings can also be provided + when creating the client, but these are sent with each request and not associated with a session). The unique + session_id can be added to the `settings` dictionary for each request, or you can disable the + `autogenerate_session_id` common setting: -```python -from clickhouse_connect import common + ```python + from clickhouse_connect import common -common.set_setting('autogenerate_session_id', False) # This should always be set before creating a client -client = clickhouse_connect.get_client(host='somehost.com', user='dbuser', password=1234) -``` + common.set_setting('autogenerate_session_id', False) # This should always be set before creating a client + client = clickhouse_connect.get_client(host='somehost.com', user='dbuser', password=1234) + ``` -In this case ClickHouse Connect will not send any session id, and a random session id will be generated by the -ClickHouse server. Again, temporary tables and session level settings will not be available. + In this case ClickHouse Connect will not send any session id, and a random session id will be generated by the + ClickHouse server. Again, temporary tables and session level settings will not be available. ### Customizing the HTTP connection pool {#customizing-the-http-connection-pool} @@ -770,14 +762,14 @@ in a similar form.) The size of a block returned from a query is governed by tw - [max_block_size](/operations/settings/settings#max_block_size) -- Limit on the size of the block in rows. Default 65536. - [preferred_block_size_bytes](/operations/settings/settings#preferred_block_size_bytes) -- Soft limit on the size of the block in bytes. Default 1,000,0000. -Regardless of the `preferred_block_size_setting`, each block will never be more than `max_block_size` rows. Depending on the -type of query, the actual blocks returned can be of any size. For example, queries to a distributed table covering many shards -may contain smaller blocks retrieved directly from each shard. + Regardless of the `preferred_block_size_setting`, each block will never be more than `max_block_size` rows. Depending on the + type of query, the actual blocks returned can be of any size. For example, queries to a distributed table covering many shards + may contain smaller blocks retrieved directly from each shard. 
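If block sizes matter for downstream processing, these settings can be passed per query through the `settings` argument. The following is a minimal sketch using the `query_row_block_stream` method (one of the `query_*_stream` methods covered above) with an arbitrary `max_block_size` value; it assumes a `client` created with `clickhouse_connect.get_client()`:

```python
import clickhouse_connect

client = clickhouse_connect.get_client()

# Cap each streamed block at 10,000 rows (an arbitrary example value).
# The stream must be opened with a `with` statement; each yielded block is a sequence of rows.
with client.query_row_block_stream(
        'SELECT number, number * 2 AS doubled FROM system.numbers LIMIT 100000',
        settings={'max_block_size': 10000}) as stream:
    for block in stream:
        print(len(block))  # never more than max_block_size rows
```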
-When using one of the Client `query_*_stream` methods, results are returned on a block by block basis. ClickHouse Connect only -loads a single block at a time. This allows processing large amounts of data without the need to load all of a large result -set into memory. Note the application should be prepared to process any number of blocks and the exact size of each block -cannot be controlled. + When using one of the Client `query_*_stream` methods, results are returned on a block by block basis. ClickHouse Connect only + loads a single block at a time. This allows processing large amounts of data without the need to load all of a large result + set into memory. Note the application should be prepared to process any number of blocks and the exact size of each block + cannot be controlled. #### HTTP data buffer for slow processing {#http-data-buffer-for-slow-processing} @@ -854,29 +846,29 @@ The "data type" argument for any formatting function can include wildcards. The Read formats can be set at several levels: - Globally, using the methods defined in the `clickhouse_connect.datatypes.format` package. This will control the format of the - configured datatype for all queries. -```python -from clickhouse_connect.datatypes.format import set_read_format - -# Return both IPv6 and IPv4 values as strings -set_read_format('IPv*', 'string') - -# Return all Date types as the underlying epoch second or epoch day -set_read_format('Date*', 'int') -``` -- For an entire query, using the optional `query_formats` dictionary argument. In that case any column (or subcolumn) of the - specified data types(s) will use the configured format. -```python -# Return any UUID column as a string -client.query('SELECT user_id, user_uuid, device_uuid from users', query_formats={'UUID': 'string'}) -``` -- For the values in a specific column, using the optional `column_formats` dictionary argument. The key is the column named as - return by ClickHouse, and format for the data column or a second level "format" dictionary of a ClickHouse type name and a value - of query formats. This secondary dictionary can be used for nested column types such as Tuples or Maps. -```python -# Return IPv6 values in the `dev_address` column as strings -client.query('SELECT device_id, dev_address, gw_address from devices', column_formats={'dev_address':'string'}) -``` + configured datatype for all queries. + ```python + from clickhouse_connect.datatypes.format import set_read_format + + # Return both IPv6 and IPv4 values as strings + set_read_format('IPv*', 'string') + + # Return all Date types as the underlying epoch second or epoch day + set_read_format('Date*', 'int') + ``` + - For an entire query, using the optional `query_formats` dictionary argument. In that case any column (or subcolumn) of the + specified data types(s) will use the configured format. + ```python + # Return any UUID column as a string + client.query('SELECT user_id, user_uuid, device_uuid from users', query_formats={'UUID': 'string'}) + ``` + - For the values in a specific column, using the optional `column_formats` dictionary argument. The key is the column named as + return by ClickHouse, and format for the data column or a second level "format" dictionary of a ClickHouse type name and a value + of query formats. This secondary dictionary can be used for nested column types such as Tuples or Maps. 
+ ```python + # Return IPv6 values in the `dev_address` column as strings + client.query('SELECT device_id, dev_address, gw_address from devices', column_formats={'dev_address':'string'}) + ``` #### Read format options (Python types) {#read-format-options-python-types} @@ -905,7 +897,6 @@ client.query('SELECT device_id, dev_address, gw_address from devices', column_fo | Variant | object | - | Returns the matching Python type for the ClickHouse datatype stored for the value | | Dynamic | object | - | Returns the matching Python type for the ClickHouse datatype stored for the value | - ### External data {#external-data} ClickHouse queries can accept external data in any ClickHouse format. This binary data is sent along with the query string to be used to process the data. Details of @@ -915,7 +906,7 @@ for that object accepts the follow arguments: | Name | Type | Description | |-----------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------| -| file_path | str | Path to a file on the local system path to read the external data from. Either `file_path` or `data` is required | +| file_path | str | Path to a file on the local system path to read the external data from. Either `file_path` or `data` is required | | file_name | str | The name of the external data "file". If not provided, will be determined from the `file_path` (without extensions) | | data | bytes | The external data in binary form (instead of being read from a file). Either `data` or `file_path` is required | | fmt | str | The ClickHouse [Input Format](/sql-reference/formats.mdx) of the data. Defaults to `TSV` | @@ -923,7 +914,6 @@ for that object accepts the follow arguments: | structure | str or seq of str | A list of column name + data type in the data (see examples). Either `structure` or `types` is required | | mime_type | str | Optional MIME type of the file data. Currently ClickHouse ignores this HTTP subheader | - To send a query with an external CSV file containing "movie" data, and combine that data with an `directors` table already present on the ClickHouse server: ```python @@ -953,13 +943,13 @@ precedence rules: 1. If the query method parameter `client_tzs` is specified for the query, the specific column time zone is applied 2. If the ClickHouse column has timezone metadata (i.e., it is a type like DateTime64(3, 'America/Denver')), the ClickHouse column timezone is applied. (Note this -timezone metadata is not available to clickhouse-connect for DateTime columns previous to ClickHouse version 23.2) + timezone metadata is not available to clickhouse-connect for DateTime columns previous to ClickHouse version 23.2) 3. If the query method parameter `query_tz` is specified for the query, the "query timezone" is applied. 4. If a timezone setting is applied to the query or session, that timezone is applied. (This functionality is not yet released in the ClickHouse Server) 5. Finally, if the client `apply_server_timezone` parameter has been set to True (the default), the ClickHouse server timezone is applied. -Note that if the applied timezone based on these rules is UTC, `clickhouse-connect` will _always_ return a time zone naive Python `datetime.datetime` object. Additional timezone -information can then be added to this timezone naive object by the application code if desired. 
+ Note that if the applied timezone based on these rules is UTC, `clickhouse-connect` will _always_ return a time zone naive Python `datetime.datetime` object. Additional timezone + information can then be added to this timezone naive object by the application code if desired. ## Inserting data with ClickHouse Connect: Advanced usage {#inserting-data-with-clickhouse-connect--advanced-usage} @@ -989,7 +979,6 @@ assert qr[0][0] == 4 InsertContexts include mutable state that is updated during the insert process, so they are not thread safe. - ### Write formats {#write-formats} Write formats are currently implemented for limited number of types. In most cases ClickHouse Connect will attempt to automatically determine the correct write format for a column by checking the type of the first (non-null) data value. @@ -1026,7 +1015,6 @@ In most cases, it is unnecessary to override the write format for a data type, b | Variant | object | | At this time on all variants are inserted as Strings and parsed by the ClickHouse server | | Dynamic | object | | Warning -- at this time any inserts into a Dynamic column are persisted as a ClickHouse String | - ## Additional options {#additional-options} ClickHouse Connect provides a number of additional options for advanced use cases @@ -1117,7 +1105,7 @@ ClickHouse types Variant, Dynamic, and JSON. #### Usage notes {#usage-notes} - JSON data can be inserted as either a Python dictionary or a JSON string containing a JSON object `{}`. Other - forms of JSON data are not supported + forms of JSON data are not supported - Queries using subcolumns/paths for these types will return the type of the sub column. - See the main ClickHouse documentation for other usage notes @@ -1126,8 +1114,8 @@ ClickHouse types Variant, Dynamic, and JSON. - The "new" JSON type is available started with the ClickHouse 24.8 release - Due to internal format changes, `clickhouse-connect` is only compatible with Variant types beginning with the ClickHouse 24.7 release - Returned JSON objects will only return the `max_dynamic_paths` number of elements (which defaults to 1024). This - will be fixed in a future release. + will be fixed in a future release. - Inserts into `Dynamic` columns will always be the String representation of the Python value. This will be fixed - in a future release, once https://github.com/ClickHouse/ClickHouse/issues/70395 has been fixed. + in a future release, once https://github.com/ClickHouse/ClickHouse/issues/70395 has been fixed. - The implementation for the new types has not been optimized in C code, so performance may be somewhat slower than for - simpler, established data types. + simpler, established data types. diff --git a/docs/integrations/language-clients/rust.md b/docs/integrations/language-clients/rust.md index 98c72902287..71b1480c500 100644 --- a/docs/integrations/language-clients/rust.md +++ b/docs/integrations/language-clients/rust.md @@ -47,16 +47,16 @@ See also: [crates.io page](https://crates.io/crates/clickhouse). * `uuid` — adds `serde::uuid` to work with [uuid](https://docs.rs/uuid) crate. * `time` — adds `serde::time` to work with [time](https://docs.rs/time) crate. -:::important -When connecting to ClickHouse via an `HTTPS` url, either the `native-tls` or `rustls-tls` feature should be enabled. -If both are enabled, the `rustls-tls` feature will take precedence. -::: + :::important + When connecting to ClickHouse via an `HTTPS` url, either the `native-tls` or `rustls-tls` feature should be enabled. 
+ If both are enabled, the `rustls-tls` feature will take precedence. + ::: ## ClickHouse versions compatibility {#clickhouse-versions-compatibility} The client is compatible with the LTS or newer versions of ClickHouse, as well as ClickHouse Cloud. -ClickHouse server older than v22.6 handles RowBinary [incorrectly in some rare cases](https://github.com/ClickHouse/ClickHouse/issues/37420). +ClickHouse server older than v22.6 handles RowBinary [incorrectly in some rare cases](https://github.com/ClickHouse/ClickHouse/issues/37420). You could use v0.11+ and enable `wa-37420` feature to solve this problem. Note: this feature should not be used with newer ClickHouse versions. ## Examples {#examples} @@ -109,7 +109,7 @@ let client = Client::default() .with_password(read_env_var("CLICKHOUSE_PASSWORD")); ``` -See also: +See also: - [HTTPS with ClickHouse Cloud example](https://github.com/ClickHouse/clickhouse-rs/blob/main/examples/clickhouse_cloud.rs) in the client repo. This should be applicable to on-premise HTTPS connections as well. ### Selecting rows {#selecting-rows} @@ -141,11 +141,11 @@ while let Some(row) = cursor.next().await? { .. } * Convenient `fetch_one::()` and `fetch_all::()` methods can be used to get a first row or all rows, correspondingly. * `sql::Identifier` can be used to bind table names. -NB: as the entire response is streamed, cursors can return an error even after producing some rows. If this happens in your use case, you could try `query(...).with_option("wait_end_of_query", "1")` in order to enable response buffering on the server-side. [More details](/interfaces/http/#response-buffering). The `buffer_size` option can be useful, too. + NB: as the entire response is streamed, cursors can return an error even after producing some rows. If this happens in your use case, you could try `query(...).with_option("wait_end_of_query", "1")` in order to enable response buffering on the server-side. [More details](/interfaces/http/#response-buffering). The `buffer_size` option can be useful, too. -:::warning -Use `wait_end_of_query` with caution when selecting rows, as it can will to higher memory consumption on the server side and will likely decrease the overall performance. -::: + :::warning + Use `wait_end_of_query` with caution when selecting rows, as it can will to higher memory consumption on the server side and will likely decrease the overall performance. + ::: ### Inserting rows {#inserting-rows} @@ -215,12 +215,12 @@ inserter.end().await?; * Time thresholds implemented by using [quanta](https://docs.rs/quanta) crate to speed the `inserter` up. Not used if `test-util` is enabled (thus, time can be managed by `tokio::time::advance()` in custom tests). * All rows between `commit()` calls are inserted in the same `INSERT` statement. 
-:::warning -Do not forget to flush if you want to terminate/finalize inserting: -```rust -inserter.end().await?; -``` -::: + :::warning + Do not forget to flush if you want to terminate/finalize inserting: + ```rust + inserter.end().await?; + ``` + ::: ### Executing DDLs {#executing-ddls} @@ -338,7 +338,7 @@ See also: [custom HTTP client example](https://github.com/ClickHouse/clickhouse- See also the additional examples: * [Simpler ClickHouse data types](https://github.com/ClickHouse/clickhouse-rs/blob/main/examples/data_types_derive_simple.rs) * [Container-like ClickHouse data types](https://github.com/ClickHouse/clickhouse-rs/blob/main/examples/data_types_derive_containers.rs) -::: + ::: * `(U)Int(8|16|32|64|128)` maps to/from corresponding `(u|i)(8|16|32|64|128)` types or newtypes around them. * `(U)Int256` are not supported directly, but there is [a workaround for it](https://github.com/ClickHouse/clickhouse-rs/issues/48). @@ -347,100 +347,100 @@ See also the additional examples: * `Boolean` maps to/from `bool` or newtypes around it. * `String` maps to/from any string or bytes types, e.g. `&str`, `&[u8]`, `String`, `Vec` or [`SmartString`](https://docs.rs/smartstring/latest/smartstring/struct.SmartString.html). New types are also supported. To store bytes, consider using [`serde_bytes`](https://docs.rs/serde_bytes/latest/serde_bytes/), because it's more efficient. -```rust -#[derive(Row, Debug, Serialize, Deserialize)] -struct MyRow<'a> { + ```rust + #[derive(Row, Debug, Serialize, Deserialize)] + struct MyRow<'a> { str: &'a str, string: String, #[serde(with = "serde_bytes")] bytes: Vec, #[serde(with = "serde_bytes")] byte_slice: &'a [u8], -} -``` + } + ``` * `FixedString(N)` is supported as an array of bytes, e.g. `[u8; N]`. -```rust -#[derive(Row, Debug, Serialize, Deserialize)] -struct MyRow { + ```rust + #[derive(Row, Debug, Serialize, Deserialize)] + struct MyRow { fixed_str: [u8; 16], // FixedString(16) -} -``` + } + ``` * `Enum(8|16)` are supported using [`serde_repr`](https://docs.rs/serde_repr/latest/serde_repr/). -```rust -use serde_repr::{Deserialize_repr, Serialize_repr}; + ```rust + use serde_repr::{Deserialize_repr, Serialize_repr}; -#[derive(Row, Serialize, Deserialize)] -struct MyRow { + #[derive(Row, Serialize, Deserialize)] + struct MyRow { level: Level, -} + } -#[derive(Debug, Serialize_repr, Deserialize_repr)] -#[repr(u8)] -enum Level { + #[derive(Debug, Serialize_repr, Deserialize_repr)] + #[repr(u8)] + enum Level { Debug = 1, Info = 2, Warn = 3, Error = 4, -} -``` + } + ``` * `UUID` maps to/from [`uuid::Uuid`](https://docs.rs/uuid/latest/uuid/struct.Uuid.html) by using `serde::uuid`. Requires the `uuid` feature. -```rust -#[derive(Row, Serialize, Deserialize)] -struct MyRow { + ```rust + #[derive(Row, Serialize, Deserialize)] + struct MyRow { #[serde(with = "clickhouse::serde::uuid")] uuid: uuid::Uuid, -} -``` + } + ``` * `IPv6` maps to/from [`std::net::Ipv6Addr`](https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html). * `IPv4` maps to/from [`std::net::Ipv4Addr`](https://doc.rust-lang.org/stable/std/net/struct.Ipv4Addr.html) by using `serde::ipv4`. -```rust -#[derive(Row, Serialize, Deserialize)] -struct MyRow { + ```rust + #[derive(Row, Serialize, Deserialize)] + struct MyRow { #[serde(with = "clickhouse::serde::ipv4")] ipv4: std::net::Ipv4Addr, -} -``` + } + ``` * `Date` maps to/from `u16` or a newtype around it and represents a number of days elapsed since `1970-01-01`. 
Also, [`time::Date`](https://docs.rs/time/latest/time/struct.Date.html) is supported by using `serde::time::date`, that requires the `time` feature. -```rust -#[derive(Row, Serialize, Deserialize)] -struct MyRow { + ```rust + #[derive(Row, Serialize, Deserialize)] + struct MyRow { days: u16, #[serde(with = "clickhouse::serde::time::date")] date: Date, -} -``` + } + ``` * `Date32` maps to/from `i32` or a newtype around it and represents a number of days elapsed since `1970-01-01`. Also, [`time::Date`](https://docs.rs/time/latest/time/struct.Date.html) is supported by using `serde::time::date32`, that requires the `time` feature. -```rust -#[derive(Row, Serialize, Deserialize)] -struct MyRow { + ```rust + #[derive(Row, Serialize, Deserialize)] + struct MyRow { days: i32, #[serde(with = "clickhouse::serde::time::date32")] date: Date, -} -``` + } + ``` * `DateTime` maps to/from `u32` or a newtype around it and represents a number of seconds elapsed since UNIX epoch. Also, [`time::OffsetDateTime`](https://docs.rs/time/latest/time/struct.OffsetDateTime.html) is supported by using `serde::time::datetime`, that requires the `time` feature. -```rust -#[derive(Row, Serialize, Deserialize)] -struct MyRow { + ```rust + #[derive(Row, Serialize, Deserialize)] + struct MyRow { ts: u32, #[serde(with = "clickhouse::serde::time::datetime")] dt: OffsetDateTime, -} -``` + } + ``` * `DateTime64(_)` maps to/from `i32` or a newtype around it and represents a time elapsed since UNIX epoch. Also, [`time::OffsetDateTime`](https://docs.rs/time/latest/time/struct.OffsetDateTime.html) is supported by using `serde::time::datetime64::*`, that requires the `time` feature. -```rust -#[derive(Row, Serialize, Deserialize)] -struct MyRow { + ```rust + #[derive(Row, Serialize, Deserialize)] + struct MyRow { ts: i64, // elapsed s/us/ms/ns depending on `DateTime64(X)` #[serde(with = "clickhouse::serde::time::datetime64::secs")] dt64s: OffsetDateTime, // `DateTime64(0)` @@ -450,8 +450,8 @@ struct MyRow { dt64us: OffsetDateTime, // `DateTime64(6)` #[serde(with = "clickhouse::serde::time::datetime64::nanos")] dt64ns: OffsetDateTime, // `DateTime64(9)` -} -``` + } + ``` * `Tuple(A, B, ...)` maps to/from `(A, B, ...)` or a newtype around it. * `Array(_)` maps to/from any slice, e.g. `Vec<_>`, `&[_]`. New types are also supported. @@ -459,43 +459,43 @@ struct MyRow { * `LowCardinality(_)` is supported seamlessly. * `Nullable(_)` maps to/from `Option<_>`. For `clickhouse::serde::*` helpers add `::option`. -```rust -#[derive(Row, Serialize, Deserialize)] -struct MyRow { + ```rust + #[derive(Row, Serialize, Deserialize)] + struct MyRow { #[serde(with = "clickhouse::serde::ipv4::option")] ipv4_opt: Option, -} -``` + } + ``` * `Nested` is supported by providing multiple arrays with renaming. -```rust -// CREATE TABLE test(items Nested(name String, count UInt32)) -#[derive(Row, Serialize, Deserialize)] -struct MyRow { + ```rust + // CREATE TABLE test(items Nested(name String, count UInt32)) + #[derive(Row, Serialize, Deserialize)] + struct MyRow { #[serde(rename = "items.name")] items_name: Vec, #[serde(rename = "items.count")] items_count: Vec, -} -``` + } + ``` * `Geo` types are supported. `Point` behaves like a tuple `(f64, f64)`, and the rest of the types are just slices of points. 
-```rust -type Point = (f64, f64); -type Ring = Vec; -type Polygon = Vec; -type MultiPolygon = Vec; -type LineString = Vec; -type MultiLineString = Vec; - -#[derive(Row, Serialize, Deserialize)] -struct MyRow { + ```rust + type Point = (f64, f64); + type Ring = Vec; + type Polygon = Vec; + type MultiPolygon = Vec; + type LineString = Vec; + type MultiLineString = Vec; + + #[derive(Row, Serialize, Deserialize)] + struct MyRow { point: Point, ring: Ring, polygon: Polygon, multi_polygon: MultiPolygon, line_string: LineString, multi_line_string: MultiLineString, -} -``` + } + ``` * `Variant`, `Dynamic`, (new) `JSON` data types aren't supported yet. @@ -508,7 +508,7 @@ See [the example](https://github.com/ClickHouse/clickhouse-rs/tree/main/examples ### CANNOT_READ_ALL_DATA {#cannot_read_all_data} -The most common cause for the `CANNOT_READ_ALL_DATA` error is that the row definition on the application side does match that in ClickHouse. +The most common cause for the `CANNOT_READ_ALL_DATA` error is that the row definition on the application side does match that in ClickHouse. Consider the following table: diff --git a/docs/integrations/migration/clickhouse-local-etl.md b/docs/integrations/migration/clickhouse-local-etl.md index c1a23ef22b1..d102651cff8 100644 --- a/docs/integrations/migration/clickhouse-local-etl.md +++ b/docs/integrations/migration/clickhouse-local-etl.md @@ -49,44 +49,37 @@ On that host machine, download the appropriate build of `clickhouse-local` based - 1. The simplest way to download `clickhouse-local` locally is to run the following command: - ```bash - curl https://clickhouse.com/ | sh - ``` - +```bash +curl https://clickhouse.com/ | sh +``` 1. Run `clickhouse-local` (it will just print its version): - ```bash - ./clickhouse-local - ``` - +```bash +./clickhouse-local +``` - 1. The simplest way to download `clickhouse-local` locally is to run the following command: - ```bash - curl https://clickhouse.com/ | sh - ``` - +```bash +curl https://clickhouse.com/ | sh +``` 1. Run `clickhouse-local` (it will just print its version): - ```bash - ./clickhouse local - ``` - +```bash +./clickhouse local +``` -:::info Important -The examples throughout this guide use the Linux commands for running `clickhouse-local` (`./clickhouse-local`). -To run `clickhouse-local` on a Mac, use `./clickhouse local`. -::: + :::info Important + The examples throughout this guide use the Linux commands for running `clickhouse-local` (`./clickhouse-local`). + To run `clickhouse-local` on a Mac, use `./clickhouse local`. + ::: + :::tip Add the remote system to your ClickHouse Cloud service IP Access List + In order for the `remoteSecure` function to connect to your ClickHouse Cloud service, the IP address of the remote system needs to be allowed by the IP Access List. Expand **Manage your IP Access List** below this tip for more information. + ::: -:::tip Add the remote system to your ClickHouse Cloud service IP Access List -In order for the `remoteSecure` function to connect to your ClickHouse Cloud service, the IP address of the remote system needs to be allowed by the IP Access List. Expand **Manage your IP Access List** below this tip for more information. -::: - - + ## Example 1: Migrating from MySQL to ClickHouse Cloud with an Integration engine {#example-1-migrating-from-mysql-to-clickhouse-cloud-with-an-integration-engine} @@ -129,7 +122,6 @@ No data is stored locally on the `clickhouse-local` host machine. 
Instead, the data is read from the source and then immediately written to the destination table on the ClickHouse Cloud service. ::: - ## Example 2: Migrating from MySQL to ClickHouse Cloud with the JDBC bridge {#example-2-migrating-from-mysql-to-clickhouse-cloud-with-the-jdbc-bridge} We will use the [JDBC integration table engine](/engines/table-engines/integrations/jdbc.md) (created on-the-fly by the [jdbc table function](/sql-reference/table-functions/jdbc.md)) together with the [ClickHouse JDBC Bridge](https://github.com/ClickHouse/clickhouse-jdbc-bridge) and the MySQL JDBC driver for reading data from the source MySQL database and we will use the [remoteSecure table function](/sql-reference/table-functions/remote.md) diff --git a/docs/integrations/migration/clickhouse-to-cloud.md b/docs/integrations/migration/clickhouse-to-cloud.md index 76902215882..4751a613f43 100644 --- a/docs/integrations/migration/clickhouse-to-cloud.md +++ b/docs/integrations/migration/clickhouse-to-cloud.md @@ -42,78 +42,74 @@ The process is: 1. Remove the source server from the IP Access List on the destination (if applicable) 1. Remove the read-only user from the source service - ### Migration of tables from one system to another: {#migration-of-tables-from-one-system-to-another} This example migrates one table from a self-managed ClickHouse server to ClickHouse Cloud. ### On the source ClickHouse system (the system that currently hosts the data) {#on-the-source-clickhouse-system-the-system-that-currently-hosts-the-data} - Add a read only user that can read the source table (`db.table` in this example) -```sql -CREATE USER exporter -IDENTIFIED WITH SHA256_PASSWORD BY 'password-here' -SETTINGS readonly = 1; -``` + ```sql + CREATE USER exporter + IDENTIFIED WITH SHA256_PASSWORD BY 'password-here' + SETTINGS readonly = 1; + ``` -```sql -GRANT SELECT ON db.table TO exporter; -``` + ```sql + GRANT SELECT ON db.table TO exporter; + ``` - Copy the table definition -```sql -SELECT create_table_query -FROM system.tables -WHERE database = 'db' AND table = 'table' -``` + ```sql + SELECT create_table_query + FROM system.tables + WHERE database = 'db' AND table = 'table' + ``` ### On the destination ClickHouse Cloud system: {#on-the-destination-clickhouse-cloud-system} - Create the destination database: -```sql -CREATE DATABASE db -``` + ```sql + CREATE DATABASE db + ``` - Using the CREATE TABLE statement from the source, create the destination. -:::tip -Change the ENGINE to to ReplicatedMergeTree without any parameters when you run the CREATE statement. ClickHouse Cloud always replicates tables and provides the correct parameters. Keep the `ORDER BY`, `PRIMARY KEY`, `PARTITION BY`, `SAMPLE BY`, `TTL`, and `SETTINGS` clauses though. -::: - -```sql -CREATE TABLE db.table ... -``` + :::tip + Change the ENGINE to ReplicatedMergeTree without any parameters when you run the CREATE statement. ClickHouse Cloud always replicates tables and provides the correct parameters. Keep the `ORDER BY`, `PRIMARY KEY`, `PARTITION BY`, `SAMPLE BY`, `TTL`, and `SETTINGS` clauses though. + ::: + ```sql + CREATE TABLE db.table ... 
+ ``` - Use the `remoteSecure` function to pull the data from the self-managed source - + -```sql -INSERT INTO db.table SELECT * FROM -remoteSecure('source-hostname', db, table, 'exporter', 'password-here') -``` + ```sql + INSERT INTO db.table SELECT * FROM + remoteSecure('source-hostname', db, table, 'exporter', 'password-here') + ``` -:::note -If the source system is not available from outside networks then you can push the data rather than pulling it, as the `remoteSecure` function works for both selects and inserts. See the next option. -::: + :::note + If the source system is not available from outside networks then you can push the data rather than pulling it, as the `remoteSecure` function works for both selects and inserts. See the next option. + ::: - Use the `remoteSecure` function to push the data to the ClickHouse Cloud service - + -:::tip Add the remote system to your ClickHouse Cloud service IP Access List -In order for the `remoteSecure` function to connect to your ClickHouse Cloud service the IP Address of the remote system will need to be allowed by the IP Access List. Expand **Manage your IP Access List** below this tip for more information. -::: - - - -```sql -INSERT INTO FUNCTION -remoteSecure('HOSTNAME.clickhouse.cloud:9440', 'db.table', -'default', 'PASS') SELECT * FROM db.table -``` + :::tip Add the remote system to your ClickHouse Cloud service IP Access List + In order for the `remoteSecure` function to connect to your ClickHouse Cloud service the IP Address of the remote system will need to be allowed by the IP Access List. Expand **Manage your IP Access List** below this tip for more information. + ::: + + ```sql + INSERT INTO FUNCTION + remoteSecure('HOSTNAME.clickhouse.cloud:9440', 'db.table', + 'default', 'PASS') SELECT * FROM db.table + ``` ## Migrating between ClickHouse Cloud services {#migrating-between-clickhouse-cloud-services} @@ -123,11 +119,11 @@ Some example uses for migrating data between ClickHouse Cloud services: - Migrating data from a restored backup - Copying data from a development service to a staging service (or staging to production) -In this example there are two ClickHouse Cloud services, and they will be referred to as *source* and *destination*. The data will be pulled from the source to the destination. Although you could push if you like, pulling is shown as it uses a read-only user. + In this example there are two ClickHouse Cloud services, and they will be referred to as *source* and *destination*. The data will be pulled from the source to the destination. Although you could push if you like, pulling is shown as it uses a read-only user. - + -There are a few steps in the migration: + There are a few steps in the migration: 1. Identify one ClickHouse Cloud service to be the *source*, and the other as the *destination* 1. Add a read-only user to the source service 1. Duplicate the source table structure on the destination service @@ -136,45 +132,42 @@ There are a few steps in the migration: 1. Re-establish the IP Access List on the destination 1. 
Remove the read-only user from the source service - #### Add a read-only user to the source service {#add-a-read-only-user-to-the-source-service} - Add a read only user that can read the source table (`db.table` in this example) - ```sql - CREATE USER exporter - IDENTIFIED WITH SHA256_PASSWORD BY 'password-here' - SETTINGS readonly = 1; - ``` + ```sql + CREATE USER exporter + IDENTIFIED WITH SHA256_PASSWORD BY 'password-here' + SETTINGS readonly = 1; + ``` - ```sql - GRANT SELECT ON db.table TO exporter; - ``` + ```sql + GRANT SELECT ON db.table TO exporter; + ``` - Copy the table definition - ```sql - select create_table_query - from system.tables - where database = 'db' and table = 'table' - ``` + ```sql + select create_table_query + from system.tables + where database = 'db' and table = 'table' + ``` #### Duplicate the table structure on the destination service {#duplicate-the-table-structure-on-the-destination-service} On the destination create the database if it is not there already: - Create the destination database: - ```sql - CREATE DATABASE db - ``` - - + ```sql + CREATE DATABASE db + ``` - Using the CREATE TABLE statement from the source, create the destination. - On the destination create the table using the output of the `select create_table_query...` from the source: + On the destination create the table using the output of the `select create_table_query...` from the source: - ```sql - CREATE TABLE db.table ... - ``` + ```sql + CREATE TABLE db.table ... + ``` #### Allow remote access to the source service {#allow-remote-access-to-the-source-service} @@ -189,12 +182,12 @@ Modify the allow list and allow access from **Anywhere** temporarily. See the [I #### Copy the data from source to destination {#copy-the-data-from-source-to-destination} - Use the `remoteSecure` function to pull the data from the source ClickHouse Cloud service - Connect to the destination. Run this command on the destination ClickHouse Cloud service: + Connect to the destination. Run this command on the destination ClickHouse Cloud service: - ```sql - INSERT INTO db.table SELECT * FROM - remoteSecure('source-hostname', db, table, 'exporter', 'password-here') - ``` + ```sql + INSERT INTO db.table SELECT * FROM + remoteSecure('source-hostname', db, table, 'exporter', 'password-here') + ``` - Verify the data in the destination service diff --git a/docs/integrations/migration/etl-tool-to-clickhouse.md b/docs/integrations/migration/etl-tool-to-clickhouse.md index f66e6ff2c47..85cf9322339 100644 --- a/docs/integrations/migration/etl-tool-to-clickhouse.md +++ b/docs/integrations/migration/etl-tool-to-clickhouse.md @@ -20,4 +20,4 @@ A great option for moving data from an external data source into ClickHouse is t - [dbt](/integrations/data-ingestion/etl-tools/dbt/index.md) - [Vector](/integrations/data-ingestion/etl-tools/vector-to-clickhouse.md) -But there are many other ETL/ELT tools that integrate with ClickHouse, so check your favorite tool's documentation for details. + But there are many other ETL/ELT tools that integrate with ClickHouse, so check your favorite tool's documentation for details. 
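Taken together, the pull-based migration patched above boils down to a handful of statements. The following is a minimal sketch only, reusing the guide's `db.table` example and its `exporter` user; `source-hostname` and `password-here` are placeholders to replace with your own values, and the `CREATE TABLE` statement must be the definition copied from the source system.

```sql
-- On the source system: a read-only user that can read the table being migrated
CREATE USER exporter
IDENTIFIED WITH SHA256_PASSWORD BY 'password-here'
SETTINGS readonly = 1;

GRANT SELECT ON db.table TO exporter;

-- On the destination ClickHouse Cloud service: recreate the schema, then pull the rows
CREATE DATABASE IF NOT EXISTS db;

-- CREATE TABLE db.table ... ENGINE = ReplicatedMergeTree ORDER BY ...;  -- paste the copied definition here

INSERT INTO db.table
SELECT * FROM remoteSecure('source-hostname', db, table, 'exporter', 'password-here');
```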
diff --git a/docs/integrations/migration/object-storage-to-clickhouse.md b/docs/integrations/migration/object-storage-to-clickhouse.md index 2f323db04ef..dc652cf7758 100644 --- a/docs/integrations/migration/object-storage-to-clickhouse.md +++ b/docs/integrations/migration/object-storage-to-clickhouse.md @@ -20,11 +20,11 @@ table functions for migrating data stored in Cloud Object Storage into a ClickHo - [gcs](/sql-reference/table-functions/gcs) - [azureBlobStorage](/sql-reference/table-functions/azureBlobStorage) -If your current database system is not able to directly offload data into a Cloud Object Storage, you could use a [third-party ETL/ELT tool](./etl-tool-to-clickhouse.md) or [clickhouse-local](./clickhouse-local-etl.md) for moving data -from you current database system to Cloud Object Storage, in order to migrate that data in a second step into a ClickHouse Cloud table. + If your current database system is not able to directly offload data into Cloud Object Storage, you could use a [third-party ETL/ELT tool](./etl-tool-to-clickhouse.md) or [clickhouse-local](./clickhouse-local-etl.md) for moving data + from your current database system to Cloud Object Storage, in order to migrate that data in a second step into a ClickHouse Cloud table. -Although this is a two steps process (offload data into a Cloud Object Storage, then load into ClickHouse), the advantage is that this -scales to petabytes thanks to a [solid ClickHouse Cloud](https://clickhouse.com/blog/getting-data-into-clickhouse-part-3-s3) support of highly-parallel reads from Cloud Object Storage. -Also you can leverage sophisticated and compressed formats like [Parquet](/interfaces/formats/#data-format-parquet). + Although this is a two-step process (offload data into Cloud Object Storage, then load into ClickHouse), the advantage is that this + scales to petabytes thanks to [solid ClickHouse Cloud](https://clickhouse.com/blog/getting-data-into-clickhouse-part-3-s3) support of highly parallel reads from Cloud Object Storage. + Also, you can leverage sophisticated and compressed formats like [Parquet](/interfaces/formats/#data-format-parquet). -There is a [blog article](https://clickhouse.com/blog/getting-data-into-clickhouse-part-3-s3) with concrete code examples showing how you can get data into ClickHouse Cloud using S3. + There is a [blog article](https://clickhouse.com/blog/getting-data-into-clickhouse-part-3-s3) with concrete code examples showing how you can get data into ClickHouse Cloud using S3. diff --git a/docs/integrations/migration/overview.md b/docs/integrations/migration/overview.md index 46457d3c294..ebf20cd9fd7 100644 --- a/docs/integrations/migration/overview.md +++ b/docs/integrations/migration/overview.md @@ -30,4 +30,4 @@ There are several options for migrating data into ClickHouse Cloud, depending on - [Anywhere!](./etl-tool-to-clickhouse.md): use one of the many popular ETL/ELT tools that connect to all kinds of different data sources - [Object Storage](./object-storage-to-clickhouse.md): easily insert data from S3 into ClickHouse -In the example [Migrate from Redshift](/integrations/data-ingestion/redshift/index.md), we present three different ways to migrate data to ClickHouse. + In the example [Migrate from Redshift](/integrations/data-ingestion/redshift/index.md), we present three different ways to migrate data to ClickHouse. 
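For the object-storage route described above, the final load step is typically a single `INSERT ... SELECT` from the matching table function. A minimal sketch, assuming a hypothetical S3 bucket path and an existing destination table `db.events` whose columns line up with the offloaded Parquet files:

```sql
-- Load Parquet files previously offloaded to object storage into an existing table.
-- The bucket URL, credentials, and table name are placeholders.
INSERT INTO db.events
SELECT *
FROM s3(
    'https://my-bucket.s3.amazonaws.com/offload/*.parquet',
    '<access-key-id>',
    '<secret-access-key>',
    'Parquet'
);
```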
diff --git a/docs/integrations/prometheus.md b/docs/integrations/prometheus.md index 76dab32921c..38954919ded 100644 --- a/docs/integrations/prometheus.md +++ b/docs/integrations/prometheus.md @@ -14,7 +14,6 @@ import prometheus_grafana_metrics_explorer from '@site/static/images/integration import prometheus_datadog from '@site/static/images/integrations/prometheus-datadog.png'; import Image from '@theme/IdealImage'; - # Prometheus Integration The feature supports integrating [Prometheus](https://prometheus.io/) to monitor ClickHouse Cloud services. Access to Prometheus metrics is exposed via the [ClickHouse Cloud API](/cloud/manage/api/api-overview) endpoint that allows users to securely connect and export metrics into their Prometheus metrics collector. These metrics can be integrated with dashboards e.g., Grafana, Datadog for visualization. @@ -38,7 +37,6 @@ To get started, [generate an API key](/cloud/manage/openapi). | Service ID | Endpoint address | uuid (optional) | | filtered_metrics | Query param | boolean (optional) | - ### Authentication {#authentication} Use your ClickHouse Cloud API key for basic authentication: @@ -169,20 +167,20 @@ global: scrape_configs: - job_name: "prometheus" - static_configs: - - targets: ["localhost:9090"] + static_configs: + - targets: ["localhost:9090"] - job_name: "clickhouse" - static_configs: + static_configs: - targets: ["api.clickhouse.cloud"] - scheme: https - params: + scheme: https + params: filtered_metrics: ["true"] - metrics_path: "/v1/organizations//prometheus" - basic_auth: + metrics_path: "/v1/organizations//prometheus" + basic_auth: username: password: - honor_labels: true -``` + honor_labels: true + ``` Note the `honor_labels` configuration parameter needs to be set to `true` for the instance label to be properly populated. Additionally, `filtered_metrics` is set to `true` in the above example, but should be configured based on user preference. @@ -193,7 +191,7 @@ Users have two primary ways to integrate with Grafana: - **Metrics Endpoint** – This approach has the advantage of not requiring any additional components or infrastructure. This offering is limited to Grafana Cloud and only requires the ClickHouse Cloud Prometheus Endpoint URL and credentials. - **Grafana Alloy** - Grafana Alloy is a vendor-neutral distribution of the OpenTelemetry (OTel) Collector, replacing the Grafana Agent. This can be used as a scraper, is deployable in your own infrastructure, and is compatible with any Prometheus endpoint. -We provide instructions on using these options below, focusing on the details specific to the ClickHouse Cloud Prometheus Endpoint. + We provide instructions on using these options below, focusing on the details specific to the ClickHouse Cloud Prometheus Endpoint. ### Grafana Cloud with metrics endpoint {#grafana-cloud-with-metrics-endpoint} @@ -202,17 +200,13 @@ We provide instructions on using these options below, focusing on the details sp - Configure the Scrape URL to point to the Prometheus endpoint and use basic auth to configure your connection with the API key/secret - Test the connection to ensure you are able to connect - - -
    - -Once configured, you should see the metrics in the drop-down that you can select to configure dashboards: + - + Once configured, you should see the metrics in the drop-down that you can select to configure dashboards: -
    + - + ### Grafana Cloud with Alloy {#grafana-cloud-with-alloy} @@ -299,13 +293,11 @@ init_config: instances: - openmetrics_endpoint: 'https://api.clickhouse.cloud/v1/organizations/97a33bdb-4db3-4067-b14f-ce40f621aae1/prometheus?filtered_metrics=true' - namespace: 'clickhouse' - metrics: + namespace: 'clickhouse' + metrics: - '^ClickHouse.*' - username: username - password: password -``` - -
    + username: username + password: password + ``` diff --git a/docs/integrations/sql-clients/datagrip.md b/docs/integrations/sql-clients/datagrip.md index 11f33769bff..a93edbe9b5b 100644 --- a/docs/integrations/sql-clients/datagrip.md +++ b/docs/integrations/sql-clients/datagrip.md @@ -28,36 +28,36 @@ DataGrip is available at https://www.jetbrains.com/datagrip/ 1. Launch DataGrip, and on the **Data Sources** tab in the **Data Sources and Drivers** dialog, click the **+** icon - + - Select **ClickHouse** + Select **ClickHouse** - :::tip - As you establish connections the order changes, ClickHouse may not be at the top of your list yet. - ::: + :::tip + As you establish connections the order changes, ClickHouse may not be at the top of your list yet. + ::: - + - Switch to the **Drivers** tab and load the ClickHouse driver - DataGrip does not ship with drivers in order to minimize the download size. On the **Drivers** tab - Select **ClickHouse** from the **Complete Support** list, and expand the **+** sign. Choose the **Latest stable** driver from the **Provided Driver** option: + DataGrip does not ship with drivers in order to minimize the download size. On the **Drivers** tab + Select **ClickHouse** from the **Complete Support** list, and expand the **+** sign. Choose the **Latest stable** driver from the **Provided Driver** option: - + ## 3. Connect to ClickHouse {#3-connect-to-clickhouse} - Specify your database connection details, and click **Test Connection**: - In step one you gathered your connection details, fill in the host URL, port, username, password, and database name, then test the connection. + In step one you gathered your connection details, fill in the host URL, port, username, password, and database name, then test the connection. - :::tip - The **HOST** entry in the DataGrip dialog is actually a URL, see the image below. + :::tip + The **HOST** entry in the DataGrip dialog is actually a URL, see the image below. - For more details on JDBC URL settings, please refer to the [ClickHouse JDBC driver](https://github.com/ClickHouse/clickhouse-java) repository. - ::: + For more details on JDBC URL settings, please refer to the [ClickHouse JDBC driver](https://github.com/ClickHouse/clickhouse-java) repository. + ::: - + ## Learn more {#learn-more} diff --git a/docs/integrations/sql-clients/dbeaver.md b/docs/integrations/sql-clients/dbeaver.md index 8e5b5bfee6e..138d01a43d2 100644 --- a/docs/integrations/sql-clients/dbeaver.md +++ b/docs/integrations/sql-clients/dbeaver.md @@ -42,29 +42,29 @@ DBeaver is available at https://dbeaver.io/download/ - Either use the **Database > New Database Connection** menu or the **New Database Connection** icon in the **Database Navigator** to bring up the **Connect to a database** dialog: - + - Select **Analytical** and then **ClickHouse**: - Build the JDBC URL. On the **Main** tab set the Host, Port, Username, Password, and Database: - + - By default the **SSL > Use SSL** property will be unset, if you are connecting to ClickHouse Cloud or a server that requires SSL on the HTTP port, then set **SSL > Use SSL** on: - + - Test the connection: - + -If DBeaver detects that you do not have the ClickHouse driver installed it will offer to download them for you: + If DBeaver detects that you do not have the ClickHouse driver installed it will offer to download them for you: - + - After downloading the driver **Test** the connection again: - + ## 4. Query ClickHouse {#4-query-clickhouse} @@ -72,11 +72,11 @@ Open a query editor and run a query. 
- Right click on your connection and choose **SQL Editor > Open SQL Script** to open a query editor: - + - An example query against `system.query_log`: - + ## Next steps {#next-steps} diff --git a/docs/integrations/sql-clients/dbvisualizer.md b/docs/integrations/sql-clients/dbvisualizer.md index 78901286134..24db70328ce 100644 --- a/docs/integrations/sql-clients/dbvisualizer.md +++ b/docs/integrations/sql-clients/dbvisualizer.md @@ -48,8 +48,8 @@ To connect a database with DbVisualizer, you must first create and setup a Datab 8. If the result from Ping Server shows that the server can be reached, click **Connect** to connect to the database server. -:::tip -See [Fixing Connection Issues](https://confluence.dbvis.com/display/UG231/Fixing+Connection+Issues) for some tips if you have problems connecting to the database. + :::tip + See [Fixing Connection Issues](https://confluence.dbvis.com/display/UG231/Fixing+Connection+Issues) for some tips if you have problems connecting to the database. ## Learn more {#learn-more} diff --git a/docs/integrations/sql-clients/jupysql.md b/docs/integrations/sql-clients/jupysql.md index ad909806e22..1f7172bb05f 100644 --- a/docs/integrations/sql-clients/jupysql.md +++ b/docs/integrations/sql-clients/jupysql.md @@ -21,7 +21,6 @@ Once the data is loaded, we'll visualize it via SQL plotting. The integration between JupySQL and ClickHouse is made possible by the use of the clickhouse_sqlalchemy library. This library allows for easy communication between the two systems, and enables users to connect to ClickHouse and pass the SQL dialect. Once connected, users can run SQL queries directly from the Clickhouse native UI, or from the Jupyter notebook directly. - ```python # Install required packages %pip install --quiet jupysql clickhouse_sqlalchemy @@ -29,7 +28,6 @@ The integration between JupySQL and ClickHouse is made possible by the use of th Note: you may need to restart the kernel to use updated packages. - ```python import pandas as pd from sklearn_evaluation import plot @@ -43,12 +41,10 @@ from sklearn_evaluation import plot **Note:** you will need to adjust the connection string according to the instance type you're trying to connect to (url, user, password). In the example below we've used a local instance. To learn more about it, check out [this guide](/get-started/quick-start). - ```python %sql clickhouse://default:@localhost:8123/default ``` - ```sql %%sql CREATE TABLE trips @@ -104,21 +100,14 @@ PARTITION BY toYYYYMM(pickup_date) ORDER BY pickup_datetime; ``` - * clickhouse://default:***@localhost:8123/default - Done. - - - - + * clickhouse://default:***@localhost:8123/default + Done.
    - - - ```sql %%sql INSERT INTO trips @@ -173,31 +162,20 @@ SELECT * FROM s3( ") SETTINGS input_format_try_infer_datetimes = 0 ``` - * clickhouse://default:***@localhost:8123/default - Done. - - - - + * clickhouse://default:***@localhost:8123/default + Done.
    - - - ```python %sql SELECT count() FROM trips limit 5; ``` - * clickhouse://default:***@localhost:8123/default - Done. - - - - + * clickhouse://default:***@localhost:8123/default + Done. @@ -208,19 +186,12 @@ SELECT * FROM s3(
    - - - ```python %sql SELECT DISTINCT(pickup_ntaname) FROM trips limit 5; ``` - * clickhouse://default:***@localhost:8123/default - Done. - - - - + * clickhouse://default:***@localhost:8123/default + Done. @@ -243,19 +214,12 @@ SELECT * FROM s3(
    - - - ```python %sql SELECT round(avg(tip_amount), 2) FROM trips ``` - * clickhouse://default:***@localhost:8123/default - Done. - - - - + * clickhouse://default:***@localhost:8123/default + Done. @@ -266,9 +230,6 @@ SELECT * FROM s3(
    - - - ```sql %%sql SELECT @@ -278,12 +239,8 @@ FROM trips GROUP BY passenger_count ``` - * clickhouse://default:***@localhost:8123/default - Done. - - - - + * clickhouse://default:***@localhost:8123/default + Done. @@ -332,9 +289,6 @@ GROUP BY passenger_count
    - - - ```sql %%sql SELECT @@ -347,11 +301,10 @@ ORDER BY pickup_date ASC limit 5; ``` -* clickhouse://default:***@localhost:8123/default -Done. - +* clickhouse://default:***@localhost:8123/default + Done. - +
    @@ -382,7 +335,7 @@ Done. -
    pickup_date pickup_ntanameQueensbridge-Ravenswood-Long Island City 9
    + ```python # %sql DESCRIBE trips; @@ -399,8 +352,8 @@ FROM trips WHERE trip_distance < 6.3 ``` - * clickhouse://default:***@localhost:8123/default - Skipping execution... + * clickhouse://default:***@localhost:8123/default + Skipping execution... ```python %sqlplot histogram --table short-trips --column trip_distance --bins 10 --with short-trips @@ -411,7 +364,6 @@ WHERE trip_distance < 6.3 ``` Histogram showing distribution of trip distances with 10 bins from the short-trips dataset - ```python ax = %sqlplot histogram --table short-trips --column trip_distance --bins 50 --with short-trips ax.grid() diff --git a/docs/integrations/sql-clients/marimo.md b/docs/integrations/sql-clients/marimo.md index cff3817d3a8..83ae78b77a0 100644 --- a/docs/integrations/sql-clients/marimo.md +++ b/docs/integrations/sql-clients/marimo.md @@ -48,7 +48,7 @@ You will then have a cell that can be run to establish a connection. ## 3. Run SQL {#run-sql} -Once you have set up a connection, you can create a new SQL cell and choose the clickhouse engine. +Once you have set up a connection, you can create a new SQL cell and choose the clickhouse engine. Choose SQL engine diff --git a/docs/integrations/sql-clients/qstudio.md b/docs/integrations/sql-clients/qstudio.md index 4f0faa45fe1..3c1eba9e812 100644 --- a/docs/integrations/sql-clients/qstudio.md +++ b/docs/integrations/sql-clients/qstudio.md @@ -28,7 +28,7 @@ QStudio uses JDBC over HTTP(S) to connect to ClickHouse; you need: - username - password - + ## 2. Download QStudio {#2-download-qstudio} @@ -39,17 +39,17 @@ QStudio is available at https://www.timestored.com/qstudio/download/ - When you first open QStudio click on the menu options **Server->Add Server** or on the add server button on the toolbar. - Then set the details: -QStudio database connection configuration screen showing ClickHouse connection settings + QStudio database connection configuration screen showing ClickHouse connection settings -1. Server Type: Clickhouse.com -2. Note for Host you MUST include https:// +1. Server Type: Clickhouse.com +2. Note for Host you MUST include https:// Host: https://abc.def.clickhouse.cloud Port: 8443 -3. Username: default +3. Username: default Password: `XXXXXXXXXXX` - 4. Click Add + 4. Click Add -If QStudio detects that you do not have the ClickHouse JDBC driver installed, it will offer to download them for you: + If QStudio detects that you do not have the ClickHouse JDBC driver installed, it will offer to download them for you: ## 4. Query ClickHouse {#4-query-clickhouse} @@ -59,7 +59,7 @@ If QStudio detects that you do not have the ClickHouse JDBC driver installed, it - An example query: -QStudio interface showing sample SQL query execution against ClickHouse database + QStudio interface showing sample SQL query execution against ClickHouse database ## Next steps {#next-steps} diff --git a/docs/integrations/sql-clients/sql-console.md b/docs/integrations/sql-clients/sql-console.md index f4a7e2c78df..1297bd86f2d 100644 --- a/docs/integrations/sql-clients/sql-console.md +++ b/docs/integrations/sql-clients/sql-console.md @@ -127,7 +127,7 @@ There are two ways to create a new query in the SQL console. 
- Click the '+' button in the tab bar - Select the 'New Query' button from the left sidebar query list -Interface showing how to create a new query using the + button or New Query button + Interface showing how to create a new query using the + button or New Query button ### Running a query {#running-a-query} @@ -139,23 +139,23 @@ By default, clicking the run button will run all commands contained in the SQL E - Run selected command(s) - Run command at the cursor -To run selected command(s), highlight the desired command or sequence of commands and click the 'Run' button (or use the `cmd / ctrl + enter` shortcut). You can also select 'Run selected' from the SQL Editor context menu (opened by right-clicking anywhere within the editor) when a selection is present. + To run selected command(s), highlight the desired command or sequence of commands and click the 'Run' button (or use the `cmd / ctrl + enter` shortcut). You can also select 'Run selected' from the SQL Editor context menu (opened by right-clicking anywhere within the editor) when a selection is present. -Interface showing how to run a selected portion of SQL query + Interface showing how to run a selected portion of SQL query -Running the command at the current cursor position can be achieved in two ways: + Running the command at the current cursor position can be achieved in two ways: - Select 'At Cursor' from the extended run options menu (or use the corresponding `cmd / ctrl + shift + enter` keyboard shortcut -Run at cursor option in the extended run options menu + Run at cursor option in the extended run options menu - - Selecting 'Run at cursor' from the SQL Editor context menu + - Selecting 'Run at cursor' from the SQL Editor context menu -Run at cursor option in the SQL Editor context menu + Run at cursor option in the SQL Editor context menu -:::note -The command present at the cursor position will flash yellow on execution. -::: + :::note + The command present at the cursor position will flash yellow on execution. + ::: ### Canceling a query {#canceling-a-query} @@ -187,9 +187,9 @@ Let's import the UK Price Paid example dataset and use that to create some GenAI 1. Create a new query by clicking the _+_ icon. 1. Paste and run the following code: - ```sql - CREATE TABLE uk_price_paid - ( + ```sql + CREATE TABLE uk_price_paid + ( price UInt32, date Date, postcode1 LowCardinality(String), @@ -204,20 +204,20 @@ Let's import the UK Price Paid example dataset and use that to create some GenAI town LowCardinality(String), district LowCardinality(String), county LowCardinality(String) - ) - ENGINE = MergeTree - ORDER BY (postcode1, postcode2, addr1, addr2); - ``` + ) + ENGINE = MergeTree + ORDER BY (postcode1, postcode2, addr1, addr2); + ``` - This query should take around 1 second to complete. Once it's done, you should have an empty table called `uk_price_paid. + This query should take around 1 second to complete. Once it's done, you should have an empty table called `uk_price_paid. 1. 
Create a new query and paste the following query: - ```sql - INSERT INTO uk_price_paid - WITH + ```sql + INSERT INTO uk_price_paid + WITH splitByChar(' ', postcode) AS p - SELECT + SELECT toUInt32(price_string) AS price, parseDateTimeBestEffortUS(time) AS date, p[1] AS postcode1, @@ -232,7 +232,7 @@ Let's import the UK Price Paid example dataset and use that to create some GenAI town, district, county - FROM url( + FROM url( 'http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv', 'CSV', 'uuid_string String, @@ -251,10 +251,10 @@ Let's import the UK Price Paid example dataset and use that to create some GenAI county String, d String, e String' - ) SETTINGS max_http_get_redirects=10; - ``` + ) SETTINGS max_http_get_redirects=10; + ``` -This query grabs the dataset from the `gov.uk` website. This file is ~4GB, so this query will take a few minutes to complete. Once ClickHouse has processed the query, you should have the entire dataset within the `uk_price_paid` table. + This query grabs the dataset from the `gov.uk` website. This file is ~4GB, so this query will take a few minutes to complete. Once ClickHouse has processed the query, you should have the entire dataset within the `uk_price_paid` table. #### Query creation {#query-creation} @@ -264,16 +264,16 @@ Let's create a query using natural language. 1. Click **Generate SQL**. You may be asked to accept that your queries are sent to Chat-GPT. You must select **I agree** to continue. 1. You can now use this prompt to enter a natural language query and have ChatGPT convert it into an SQL query. In this example we're going to enter: - > Show me the total price and total number of all uk_price_paid transactions by year. + > Show me the total price and total number of all uk_price_paid transactions by year. 1. The console will generate the query we're looking for and display it in a new tab. In our example, GenAI created the following query: - ```sql - -- Show me the total price and total number of all uk_price_paid transactions by year. - SELECT year(date), sum(price) as total_price, Count(*) as total_transactions - FROM uk_price_paid - GROUP BY year(date) - ``` + ```sql + -- Show me the total price and total number of all uk_price_paid transactions by year. + SELECT year(date), sum(price) as total_price, Count(*) as total_transactions + FROM uk_price_paid + GROUP BY year(date) + ``` 1. Once you've verified that the query is correct, click **Run** to execute it. @@ -283,19 +283,19 @@ Now, let's test the query debugging capabilities of GenAI. 1. Create a new query by clicking the _+_ icon and paste the following code: - ```sql - -- Show me the total price and total number of all uk_price_paid transactions by year. - SELECT year(date), sum(pricee) as total_price, Count(*) as total_transactions - FROM uk_price_paid - GROUP BY year(date) - ``` + ```sql + -- Show me the total price and total number of all uk_price_paid transactions by year. + SELECT year(date), sum(pricee) as total_price, Count(*) as total_transactions + FROM uk_price_paid + GROUP BY year(date) + ``` 1. Click **Run**. The query fails since we're trying to get values from `pricee` instead of `price`. 1. Click **Fix Query**. 1. GenAI will attempt to fix the query. In this case, it changed `pricee` to `price`. It also realised that `toYear` is a better function to use in this scenario. 1. Select **Apply** to add the suggested changes to your query and click **Run**. -Keep in mind that GenAI is an experimental feature. 
Use caution when running GenAI-generated queries against any dataset. + Keep in mind that GenAI is an experimental feature. Use caution when running GenAI-generated queries against any dataset. ## Advanced querying features {#advanced-querying-features} @@ -377,13 +377,13 @@ A number of more advanced chart characteristics can also be adjusted in the 'Adv - Axis titles - Label orientation for the x-axis -Our chart will be updated accordingly: + Our chart will be updated accordingly: -Update subtitle etc. + Update subtitle etc. -In some scenarios, it may be necessary to adjust the axis scales for each field independently. This can also be accomplished in the 'Advanced' section of the chart configuration pane by specifying min and max values for an axis range. As an example, the above chart looks good, but in order to demonstrate the correlation between our `trip_total` and `fare_total` fields, the axis ranges need some adjustment: + In some scenarios, it may be necessary to adjust the axis scales for each field independently. This can also be accomplished in the 'Advanced' section of the chart configuration pane by specifying min and max values for an axis range. As an example, the above chart looks good, but in order to demonstrate the correlation between our `trip_total` and `fare_total` fields, the axis ranges need some adjustment: -Adjust axis scale + Adjust axis scale ## Sharing queries {#sharing-queries} @@ -404,4 +404,3 @@ A dialog will open, allowing you to share the query with all members of a team. In some scenarios, it may be necessary to adjust the axis scales for each field independently. This can also be accomplished in the 'Advanced' section of the chart configuration pane by specifying min and max values for an axis range. As an example, the above chart looks good, but in order to demonstrate the correlation between our `trip_total` and `fare_total` fields, the axis ranges need some adjustment: Shared with me section in the query list - diff --git a/docs/integrations/sql-clients/tablum.md b/docs/integrations/sql-clients/tablum.md index d2202dccc57..d6221d853ca 100644 --- a/docs/integrations/sql-clients/tablum.md +++ b/docs/integrations/sql-clients/tablum.md @@ -22,7 +22,6 @@ import CommunityMaintainedBadge from '@theme/badges/CommunityMaintained'; You can install a self-hosted version of TABLUM.IO on your Linux server in docker. ::: - ## 1. Sign up or sign in to the service {#1-sign-up-or-sign-in-to-the-service} First, sign up to TABLUM.IO using your email or use a quick-login via accounts in Google or Facebook. @@ -64,7 +63,7 @@ With TABLUM.IO you can * create and utilise multiple ClickHouse connectors within your TABLUM.IO account, * run queries on any loaded data regardless of the data source, * share the results as a new ClickHouse database. -::: + ::: ## Learn more {#learn-more} diff --git a/docs/intro.md b/docs/intro.md index 27679f8e1a3..8189736ff96 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -90,8 +90,6 @@ ClickHouse chooses the join algorithm adaptively, it starts with fast hash joins ClickHouse is well known for having extremely fast query performance. To learn why ClickHouse is so fast, see the [Why is ClickHouse fast?](/concepts/why-clickhouse-is-so-fast.md) guide. - - + Note that for CPU efficiency, the query language must be declarative (SQL or MDX), or at least a vector (J, K). The query should only contain implicit loops, allowing for optimization. 
+ --> diff --git a/docs/introduction-index.md b/docs/introduction-index.md index 4acb45d44c3..0f01bcf179e 100644 --- a/docs/introduction-index.md +++ b/docs/introduction-index.md @@ -10,7 +10,7 @@ Welcome to ClickHouse! Check out the pages below to learn how to get up and runn | Page | Description | |------------------------------------------------|--------------------------------------------------------------------| | [What is ClickHouse?](about-us/intro.mdx) | Learn more about what ClickHouse is. | -| [Quick Start](/get-started/quick-start) | Quick start guide to get you up and running in no time. | +| [Quick Start](/get-started/quick-start) | Quick start guide to get you up and running in no time. | | [Advanced Tutorial](tutorial.md) | Comfortable with the basics? Let's do something more interesting. | | [Install](getting-started/install/install.mdx) | Learn about the various ways you can install ClickHouse. | -| [Deployment modes](deployment-modes.md) | This guide explores the four main ways to deploy and use ClickHouse.| +| [Deployment modes](deployment-modes.md) | This guide explores the four main ways to deploy and use ClickHouse.| diff --git a/docs/managing-data/core-concepts/academic_overview.mdx b/docs/managing-data/core-concepts/academic_overview.mdx index 1ea8d38e89d..2af60544a38 100644 --- a/docs/managing-data/core-concepts/academic_overview.mdx +++ b/docs/managing-data/core-concepts/academic_overview.mdx @@ -54,9 +54,9 @@ ClickHouse is designed to address five key challenges of modern analytical data 5. **Industry-grade robustness and versatile deployment**. As commodity hardware is unreliable, databases must provide data replication for robustness against node failures. Also, databases should run on any hardware, from old laptops to powerful servers. Finally, to avoid the overhead of garbage collection in JVM-based programs and enable bare-metal performance (e.g. SIMD), databases are ideally deployed as native binaries for the target platform. -Image 01 + Image 01 -Figure 1: ClickHouse timeline. + Figure 1: ClickHouse timeline. ## 2 ARCHITECTURE {#2-architecture} @@ -82,7 +82,6 @@ Finally, the ClickHouse database engine can be operated in on-premise, cloud, st This section discusses MergeTree* table engines as ClickHouse's native storage format. We describe their on-disk representation and discuss three data pruning techniques in ClickHouse. Afterwards, we present merge strategies which continuously transform data without impacting simultaneous inserts. Finally, we explain how updates and deletes are implemented, as well as data deduplication, data replication, and ACID compliance. - ### 3.1 On-Disk Format {#3-1-on-disk-format} Each table in the MergeTree* table engine is organized as a collection of immutable table parts. A part is created whenever a set of rows is inserted into the table. Parts are self-contained in the sense that they include all metadata required to interpret their content without additional lookups to a central catalog. To keep the number of parts per table low, a background merge job periodically combines multiple smaller parts into a larger part until a configurable part size is reached (150 GB by default). Since parts are sorted by the table's primary key columns (see Section [3.2)](#page-3-0), efficient k-way merge sort [\[40\]](#page-12-5) is used for merging. The source parts are marked as inactive and eventually deleted as soon as their reference count drops to zero, i.e. no further queries read from them. 
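The part lifecycle sketched in this section can also be observed from SQL. The following is a minimal illustration against the `system.parts` and `system.merges` tables; the `db.events` names are placeholders for any MergeTree table:

```sql
-- Active and inactive parts of a MergeTree table, with their merge level and row counts.
SELECT name, active, level, rows
FROM system.parts
WHERE database = 'db' AND table = 'events'
ORDER BY name;

-- Merges currently running in the background for the same table.
SELECT result_part_name, num_parts, elapsed, progress
FROM system.merges
WHERE database = 'db' AND table = 'events';
```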
@@ -180,10 +179,8 @@ To maximize the performance of concurrent read and write operations, ClickHouse In practice, most of ClickHouse's write-heavy decision making use cases even tolerate a small risk of losing new data in case of a power outage. The database takes advantage of this by not forcing a commit (fsync) of newly inserted parts to disk by default, allowing the kernel to batch writes at the cost of forgoing atomicity. - ## 4 QUERY PROCESSING LAYER {#4-query-processing-layer} - Image 07 Figure 7: Parallelization across SIMD units, cores and nodes. @@ -243,13 +240,13 @@ This section presents selected key performance optimizations applied to differen - instant clearing of hash tables for reuse using per-hash-map and per-cell version counters, - usage of CPU prefetch (__builtin_prefetch) to speed up the retrieval of values after hashing the key. -**Joins**. As ClickHouse originally supported joins only rudimentarily, many use cases historically resorted to denormalized tables. Today, the database [offers](https://clickhou.se/joins) all join types available in SQL (inner, left- /right/full outer, cross, as-of), as well as different join algorithms such as hash join (naïve, grace), sort-merge join, and index join for table engines with fast key-value lookup (usually dictionaries). + **Joins**. As ClickHouse originally supported joins only rudimentarily, many use cases historically resorted to denormalized tables. Today, the database [offers](https://clickhou.se/joins) all join types available in SQL (inner, left- /right/full outer, cross, as-of), as well as different join algorithms such as hash join (naïve, grace), sort-merge join, and index join for table engines with fast key-value lookup (usually dictionaries). -Since joins are among the most expensive database operations, it is important to provide parallel variants of the classic join algorithms, ideally with configurable space/time trade-ofs. For hash joins, ClickHouse implements the non-blocking, shared partition algorithm from [\[7\]](#page-12-23). For example, the query in [Figure 9](#page-8-3) computes how users move between URLs via a self-join on a page hit statistics table. The build phase of the join is split into three lanes, covering three disjoint ranges of the source table. Instead of a global hash table, a partitioned hash table is used. The (typically three) worker threads determine the target partition for each input row of the build side by computing the modulo of a hash function. Access to the hash table partitions is synchronized using Gather exchange operators. The probe phase finds the target partition of its input tuples similarly. While this algorithm introduces two additional hash calculations per tuple, it greatly reduces latch contention in the build phase, depending on the number of hash table partitions. + Since joins are among the most expensive database operations, it is important to provide parallel variants of the classic join algorithms, ideally with configurable space/time trade-ofs. For hash joins, ClickHouse implements the non-blocking, shared partition algorithm from [\[7\]](#page-12-23). For example, the query in [Figure 9](#page-8-3) computes how users move between URLs via a self-join on a page hit statistics table. The build phase of the join is split into three lanes, covering three disjoint ranges of the source table. Instead of a global hash table, a partitioned hash table is used. 
The (typically three) worker threads determine the target partition for each input row of the build side by computing the modulo of a hash function. Access to the hash table partitions is synchronized using Gather exchange operators. The probe phase finds the target partition of its input tuples similarly. While this algorithm introduces two additional hash calculations per tuple, it greatly reduces latch contention in the build phase, depending on the number of hash table partitions. -Image 09 + Image 09 -Figure 9: Parallel hash join with three hash table partitions. + Figure 9: Parallel hash join with three hash table partitions. ### 4.5 Workload Isolation {#4-5-workload-isolation} @@ -283,7 +280,6 @@ Second, integration **database engines** map all tables of a table schema in a r Third, **dictionaries** can be populated using arbitrary queries against almost all possible data sources with a corresponding integration table function or engine. The runtime behavior is active since data is pulled in constant intervals from remote storage. - Data Formats. To interact with 3rd party systems, modern analytical databases must also be able to process data in any format. Besides its native format, ClickHouse supports [90+](https://clickhou.se/query-formats) formats, including CSV, JSON, Parquet, Avro, ORC, Arrow, and Protobuf. Each format can be an input format (which ClickHouse can read), an output format (which ClickHouse can export), or both. Some analytics-oriented formats like Parquet are also integrated with query processing, i.e, the optimizer can exploit embedded statistics, and filters are evaluated directly on compressed data. Compatibility interfaces. Besides its native binary wire protocol and HTTP, clients can interact with ClickHouse over MySQL or PostgreSQL wire-protocol-compatible interfaces. This compatibility feature is useful to enable access from proprietary applications (e.g. certain business intelligence tools), where vendors have not yet implemented native ClickHouse connectivity. @@ -306,7 +302,6 @@ A wide range of tools is available to investigate performance bottlenecks in ind ### 6.2 Benchmarks {#6-2-benchmarks} - While benchmarking has been criticized for being not realistic enough [\[10,](#page-12-27) [52,](#page-13-22) [66,](#page-13-23) [74\]](#page-13-24), it is still useful to identify the strengths and weaknesses of databases. In the following, we discuss how benchmarks are used to evaluate the performance of ClickHouse. #### 6.2.1 Denormalized Tables {#6-2-1-denormalized-tables} @@ -319,7 +314,6 @@ Filter and aggregation queries on denormalized fact tables historically represen Figure 10: Relative cold and hot runtimes of ClickBench. - To track the performance of SELECTs in more diverse workloads over time, we [use](https://clickhou.se/performance-over-years) a combination of four benchmarks called VersionsBench [\[19\]](#page-12-30). This benchmark is executed once per month when a new release is published to assess its performance [\[20\]](#page-12-31) and identify code changes that potentially degraded performance: Individual benchmarks include: 1. ClickBench (described above), 2. 15 MgBench [\[21\]](#page-12-32) queries, 3. 13 queries against a denormalized Star Schema Benchmark [\[57\]](#page-13-26) fact table with 600 million rows. 4. 4 queries against [NYC Taxi Rides](https://clickhou.se/nyc-taxi-rides-benchmark) with 3.4 billion rows [\[70\]](#page-13-27). 
[Figure 11](#page-10-5) shows the development of the VersionsBench runtimes for 77 ClickHouse versions between March 2018 and March 2024. To compensate for differences in the relative runtime of individual queries, we normalize the runtimes using a geometric mean with the ratio to the minimum query runtime across all versions as weight. The performance of VersionBench improved by 1.72 × over the past six years. Dates for releases with long-term support (LTS) are marked on the x-axis. Although performance deteriorated temporarily in some periods, LTS releases generally have comparable or better performance than the previous LTS version. The significant improvement in August 2022 was caused by the column-by-column filter evaluation technique described in Section [4.4.](#page-7-0) @@ -328,7 +322,6 @@ To track the performance of SELECTs in more diverse workloads over time, we [use Figure 11: Relative hot runtimes of VersionsBench 2018-2024. - #### 6.2.2 Normalized tables {#6-2-2-normalized-tables} In classical warehousing, data is often modeled using star or snowfake schemas. We present runtimes of TPC-H queries (scale factor 100) but remark that normalized tables are an emerging use case for ClickHouse. [Figure 12](#page-10-6) shows the hot runtimes of the TPC-H queries based on the parallel hash join algorithm described in Section [4.4.](#page-7-0) The measurements were taken on a single-node AWS EC2 c6i.16xlarge instance with 64 vCPUs, 128 GB RAM, and 5000 IOPS / 1000 MiB/s disk. The fastest of fve runs was recorded. For reference, we performed the same measurements in a Snowfake system of comparable size (warehouse size L, 8x8 vCPUs, 8x16 GB RAM). The results of eleven queries are excluded from the table: Queries Q2, Q4, Q13, Q17, and Q20-22 include correlated subqueries which are not supported as of ClickHouse v24.6. Queries Q7-Q9 and Q19 depend on extended plan-level optimizations for joins such as join reordering and join predicate pushdown (both missing as of ClickHouse v24.6.) to achieve viable runtimes. Automatic subquery decorrelation and better optimizer support for joins are planned for implementation in 2024 [\[18\]](#page-12-33). Out of the remaining 11 queries, 5 (6) queries executed faster in ClickHouse (Snowfake). As aforementioned optimizations are known to be critical for performance [\[27\]](#page-12-34), we expect them to improve runtimes of these queries further once implemented. diff --git a/docs/managing-data/core-concepts/merges.md b/docs/managing-data/core-concepts/merges.md index c58de7a4f1e..7f1ca444426 100644 --- a/docs/managing-data/core-concepts/merges.md +++ b/docs/managing-data/core-concepts/merges.md @@ -15,7 +15,6 @@ import merges_07 from '@site/static/images/managing-data/core-concepts/merges_07 import merges_dashboard from '@site/static/images/managing-data/core-concepts/merges-dashboard.gif'; import Image from '@theme/IdealImage'; - ## What are part merges in ClickHouse? {#what-are-part-merges-in-clickhouse}
    @@ -41,6 +40,7 @@ The `merge level` of a part is incremented by one with each additional merge. A ## Monitoring merges {#monitoring-merges} In the [what are table parts](/parts) example, we [showed](/parts#monitoring-table-parts) that ClickHouse tracks all table parts in the [parts](/operations/system-tables/parts) system table. We used the following query to retrieve the merge level and the number of stored rows per active part of the example table: + ```sql SELECT name, @@ -52,6 +52,7 @@ ORDER BY name ASC; ``` The [previously documented](/parts#monitoring-table-parts) query result shows that the example table had four active parts, each created from a single merge of the initially inserted parts: + ```response ┌─name────────┬─level─┬────rows─┐ 1. │ all_0_5_1 │ 1 │ 6368414 │ @@ -64,17 +65,15 @@ The [previously documented](/parts#monitoring-table-parts) query result shows th [Running](https://sql.clickhouse.com/?query=U0VMRUNUCiAgICBuYW1lLAogICAgbGV2ZWwsCiAgICByb3dzCkZST00gc3lzdGVtLnBhcnRzCldIRVJFIChkYXRhYmFzZSA9ICd1aycpIEFORCAoYHRhYmxlYCA9ICd1a19wcmljZV9wYWlkX3NpbXBsZScpIEFORCBhY3RpdmUKT1JERVIgQlkgbmFtZSBBU0M7&run_query=true&tab=results) the query now shows that the four parts have since merged into a single final part (as long as there are no further inserts into the table): ```response - ┌─name───────┬─level─┬─────rows─┐ +┌─name───────┬─level─┬─────rows─┐ 1. │ all_0_23_2 │ 2 │ 25248433 │ - └────────────┴───────┴──────────┘ +└────────────┴───────┴──────────┘ ``` In ClickHouse 24.10, a new [merges dashboard](https://presentations.clickhouse.com/2024-release-24.10/index.html#17) was added to the built-in [monitoring dashboards](https://clickhouse.com/blog/common-issues-you-can-solve-using-advanced-monitoring-dashboards). Available in both OSS and Cloud via the `/merges` HTTP handler, we can use it to visualize all part merges for our example table: PART MERGES -
    - The recorded dashboard above captures the entire process, from the initial data inserts to the final merge into a single part: ① Number of active parts. @@ -131,7 +130,6 @@ The mechanics of step ② depend on the specific [MergeTree engine](/engines/tab Next, we will briefly outline the merge mechanics of specific engines in the MergeTree family. - ### Standard merges {#standard-merges} The diagram below illustrates how parts in a standard [MergeTree](/engines/table-engines/mergetree-family/mergetree) table are merged: diff --git a/docs/managing-data/core-concepts/partitions.md b/docs/managing-data/core-concepts/partitions.md index 605b3c305d8..d575693393b 100644 --- a/docs/managing-data/core-concepts/partitions.md +++ b/docs/managing-data/core-concepts/partitions.md @@ -10,12 +10,10 @@ import merges_with_partitions from '@site/static/images/managing-data/core-conce import partition_pruning from '@site/static/images/managing-data/core-concepts/partition-pruning.png'; import Image from '@theme/IdealImage'; - ## What are table partitions in ClickHouse? {#what-are-table-partitions-in-clickhouse}
    - Partitions group the [data parts](/parts) of a table in the [MergeTree engine family](/engines/table-engines/mergetree-family) into organized, logical units, which is a way of organizing data that is conceptually meaningful and aligned with specific criteria, such as time ranges, categories, or other key attributes. These logical units make data easier to manage, query, and optimize. ### PARTITION BY {#partition-by} @@ -85,8 +83,6 @@ GROUP BY partition ORDER BY partition ASC; ``` - - ## What are table partitions used for? {#what-are-table-partitions-used-for} ### Data management {#data-management} @@ -108,7 +104,6 @@ TTL date + INTERVAL 12 MONTH DELETE; ``` Since the table is partitioned by `toStartOfMonth(date)`, entire partitions (sets of [table parts](/parts)) that meet the TTL condition will be dropped, making the cleanup operation more efficient, [without having to rewrite parts](/sql-reference/statements/alter#mutations). - Similarly, instead of deleting old data, it can be automatically and efficiently moved to a more cost-effective [storage tier](/integrations/s3#storage-tiers): ```sql @@ -159,7 +154,6 @@ WHERE date >= '2020-12-01' AND date <= '2020-12-31' AND town = 'LONDON'; - ┌─explain──────────────────────────────────────────────────────────────────────────────────────────────────────┐ 1. │ Expression ((Project names + Projection)) │ 2. │ Aggregating │ @@ -186,20 +180,20 @@ WHERE date >= '2020-12-01' 23. │ Parts: 1/1 │ 24. │ Granules: 1/11 │ └──────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` + ``` -The output above shows: + The output above shows: -① Partition pruning: Row 7 to 18 of the EXPLAIN output above show that ClickHouse first uses the `date` field's [MinMax index](/partitions#what-are-table-partitions-in-clickhouse) to identify 11 out of 3257 existing [granules](/guides/best-practices/sparse-primary-indexes#data-is-organized-into-granules-for-parallel-data-processing) (blocks of rows) stored in 1 out of 436 existing active data parts that contain rows matching the query's `date` filter. + ① Partition pruning: Row 7 to 18 of the EXPLAIN output above show that ClickHouse first uses the `date` field's [MinMax index](/partitions#what-are-table-partitions-in-clickhouse) to identify 11 out of 3257 existing [granules](/guides/best-practices/sparse-primary-indexes#data-is-organized-into-granules-for-parallel-data-processing) (blocks of rows) stored in 1 out of 436 existing active data parts that contain rows matching the query's `date` filter. -② Granule pruning: Row 19 to 24 of the EXPLAIN output above indicate that ClickHouse then uses the [primary index](/guides/best-practices/sparse-primary-indexes) (created over the `town`-field) of the data part identified in step ① to further reduce the number of granules (that contain rows potentially also matching the query's `town` filter) from 11 to 1. This is also reflected in the ClickHouse-client output that we printed further above for the query run: + ② Granule pruning: Row 19 to 24 of the EXPLAIN output above indicate that ClickHouse then uses the [primary index](/guides/best-practices/sparse-primary-indexes) (created over the `town`-field) of the data part identified in step ① to further reduce the number of granules (that contain rows potentially also matching the query's `town` filter) from 11 to 1. This is also reflected in the ClickHouse-client output that we printed further above for the query run: -```response -... Elapsed: 0.006 sec. 
Processed 8.19 thousand rows, 57.34 KB (1.36 million rows/s., 9.49 MB/s.) -Peak memory usage: 2.73 MiB. -``` + ```response + ... Elapsed: 0.006 sec. Processed 8.19 thousand rows, 57.34 KB (1.36 million rows/s., 9.49 MB/s.) + Peak memory usage: 2.73 MiB. + ``` -Meaning that ClickHouse scanned and processed 1 granule (block of [8192](/operations/settings/merge-tree-settings#index_granularity) rows) in 6 milliseconds for calculating the query result. + Meaning that ClickHouse scanned and processed 1 granule (block of [8192](/operations/settings/merge-tree-settings#index_granularity) rows) in 6 milliseconds for calculating the query result. ### Partitioning is primarily a data management feature {#partitioning-is-primarily-a-data-management-feature} @@ -231,7 +225,6 @@ GROUP BY table; ``` As shown further above, the partitioned table `uk_price_paid_simple_partitioned` has over 600 partitions, and therefore at 600 306 active data parts. Whereas for our non-partitioned table `uk_price_paid_simple` all [initial](/parts) data parts could be merged into a single active part by background merges. - When we [check](https://sql.clickhouse.com/?query=RVhQTEFJTiBpbmRleGVzID0gMQpTRUxFQ1QgTUFYKHByaWNlKSBBUyBoaWdoZXN0X3ByaWNlCkZST00gdWsudWtfcHJpY2VfcGFpZF9zaW1wbGVfcGFydGl0aW9uZWQKV0hFUkUgdG93biA9ICdMT05ET04nOw&run_query=true&tab=results) the physical query execution plan with an [EXPLAIN](/sql-reference/statements/explain) clause for our example query from above without the partition filter running over the partitioned table, we can see in row 19 and 20 of the output below that ClickHouse identified 671 out of 3257 existing [granules](/guides/best-practices/sparse-primary-indexes#data-is-organized-into-granules-for-parallel-data-processing) (blocks of rows) spread over 431 out of 436 existing active data parts that potentially contain rows matching the query's filter, and therefore will be scanned and processed by the query engine: ```sql @@ -240,7 +233,6 @@ SELECT MAX(price) AS highest_price FROM uk.uk_price_paid_simple_partitioned WHERE town = 'LONDON'; - ┌─explain─────────────────────────────────────────────────────────┐ 1. │ Expression ((Project names + Projection)) │ 2. │ Aggregating │ @@ -263,60 +255,58 @@ WHERE town = 'LONDON'; 19. │ Parts: 431/436 │ 20. 
│ Granules: 671/3257 │ └─────────────────────────────────────────────────────────────────┘ -``` + ``` -The physical query execution plan for the same example query running over the table without partitions [shows](https://sql.clickhouse.com/?query=RVhQTEFJTiBpbmRleGVzID0gMQpTRUxFQ1QgTUFYKHByaWNlKSBBUyBoaWdoZXN0X3ByaWNlCkZST00gdWsudWtfcHJpY2VfcGFpZF9zaW1wbGUKV0hFUkUgdG93biA9ICdMT05ET04nOw&run_query=true&tab=results) in row 11 and 12 of the output below that ClickHouse identified 241 out of 3083 existing blocks of rows within the table's single active data part that potentially contain rows matching the query's filter: - -```sql -EXPLAIN indexes = 1 -SELECT MAX(price) AS highest_price -FROM uk.uk_price_paid_simple -WHERE town = 'LONDON'; + The physical query execution plan for the same example query running over the table without partitions [shows](https://sql.clickhouse.com/?query=RVhQTEFJTiBpbmRleGVzID0gMQpTRUxFQ1QgTUFYKHByaWNlKSBBUyBoaWdoZXN0X3ByaWNlCkZST00gdWsudWtfcHJpY2VfcGFpZF9zaW1wbGUKV0hFUkUgdG93biA9ICdMT05ET04nOw&run_query=true&tab=results) in row 11 and 12 of the output below that ClickHouse identified 241 out of 3083 existing blocks of rows within the table's single active data part that potentially contain rows matching the query's filter: + ```sql + EXPLAIN indexes = 1 + SELECT MAX(price) AS highest_price + FROM uk.uk_price_paid_simple + WHERE town = 'LONDON'; ┌─explain───────────────────────────────────────────────┐ - 1. │ Expression ((Project names + Projection)) │ - 2. │ Aggregating │ - 3. │ Expression (Before GROUP BY) │ - 4. │ Expression │ - 5. │ ReadFromMergeTree (uk.uk_price_paid_simple) │ - 6. │ Indexes: │ - 7. │ PrimaryKey │ - 8. │ Keys: │ - 9. │ town │ + 1. │ Expression ((Project names + Projection)) │ + 2. │ Aggregating │ + 3. │ Expression (Before GROUP BY) │ + 4. │ Expression │ + 5. │ ReadFromMergeTree (uk.uk_price_paid_simple) │ + 6. │ Indexes: │ + 7. │ PrimaryKey │ + 8. │ Keys: │ + 9. │ town │ 10. │ Condition: (town in ['LONDON', 'LONDON']) │ 11. │ Parts: 1/1 │ 12. │ Granules: 241/3083 │ └───────────────────────────────────────────────────────┘ -``` + ``` -For [running](https://sql.clickhouse.com/?query=U0VMRUNUIE1BWChwcmljZSkgQVMgaGlnaGVzdF9wcmljZQpGUk9NIHVrLnVrX3ByaWNlX3BhaWRfc2ltcGxlX3BhcnRpdGlvbmVkCldIRVJFIHRvd24gPSAnTE9ORE9OJzs&run_query=true&tab=results) the query over the partitioned version of the table, ClickHouse scans and processes 671 blocks of rows (~ 5.5 million rows) in 90 milliseconds: + For [running](https://sql.clickhouse.com/?query=U0VMRUNUIE1BWChwcmljZSkgQVMgaGlnaGVzdF9wcmljZQpGUk9NIHVrLnVrX3ByaWNlX3BhaWRfc2ltcGxlX3BhcnRpdGlvbmVkCldIRVJFIHRvd24gPSAnTE9ORE9OJzs&run_query=true&tab=results) the query over the partitioned version of the table, ClickHouse scans and processes 671 blocks of rows (~ 5.5 million rows) in 90 milliseconds: -```sql -SELECT MAX(price) AS highest_price -FROM uk.uk_price_paid_simple_partitioned -WHERE town = 'LONDON'; + ```sql + SELECT MAX(price) AS highest_price + FROM uk.uk_price_paid_simple_partitioned + WHERE town = 'LONDON'; -┌─highest_price─┐ -│ 594300000 │ -- 594.30 million -└───────────────┘ - -1 row in set. Elapsed: 0.090 sec. Processed 5.48 million rows, 27.95 MB (60.66 million rows/s., 309.51 MB/s.) -Peak memory usage: 163.44 MiB. -``` + ┌─highest_price─┐ + │ 594300000 │ -- 594.30 million + └───────────────┘ + 1 row in set. Elapsed: 0.090 sec. Processed 5.48 million rows, 27.95 MB (60.66 million rows/s., 309.51 MB/s.) + Peak memory usage: 163.44 MiB. 
+ ``` -Whereas for [running](https://sql.clickhouse.com/?query=U0VMRUNUIE1BWChwcmljZSkgQVMgaGlnaGVzdF9wcmljZQpGUk9NIHVrLnVrX3ByaWNlX3BhaWRfc2ltcGxlCldIRVJFIHRvd24gPSAnTE9ORE9OJzs&run_query=true&tab=results) the query over the non-partitioned table, ClickHouse scans and processes 241 blocks (~ 2 million rows) of rows in 12 milliseconds: + Whereas for [running](https://sql.clickhouse.com/?query=U0VMRUNUIE1BWChwcmljZSkgQVMgaGlnaGVzdF9wcmljZQpGUk9NIHVrLnVrX3ByaWNlX3BhaWRfc2ltcGxlCldIRVJFIHRvd24gPSAnTE9ORE9OJzs&run_query=true&tab=results) the query over the non-partitioned table, ClickHouse scans and processes 241 blocks (~ 2 million rows) of rows in 12 milliseconds: -```sql -SELECT MAX(price) AS highest_price -FROM uk.uk_price_paid_simple -WHERE town = 'LONDON'; + ```sql + SELECT MAX(price) AS highest_price + FROM uk.uk_price_paid_simple + WHERE town = 'LONDON'; -┌─highest_price─┐ -│ 594300000 │ -- 594.30 million -└───────────────┘ + ┌─highest_price─┐ + │ 594300000 │ -- 594.30 million + └───────────────┘ -1 row in set. Elapsed: 0.012 sec. Processed 1.97 million rows, 9.87 MB (162.23 million rows/s., 811.17 MB/s.) -Peak memory usage: 62.02 MiB. -``` + 1 row in set. Elapsed: 0.012 sec. Processed 1.97 million rows, 9.87 MB (162.23 million rows/s., 811.17 MB/s.) + Peak memory usage: 62.02 MiB. + ``` diff --git a/docs/managing-data/core-concepts/parts.md b/docs/managing-data/core-concepts/parts.md index 61f7e8ff4a1..31815f3d2a7 100644 --- a/docs/managing-data/core-concepts/parts.md +++ b/docs/managing-data/core-concepts/parts.md @@ -5,7 +5,6 @@ description: 'What are data parts in ClickHouse' keywords: ['part'] --- - import merges from '@site/static/images/managing-data/core-concepts/merges.png'; import part from '@site/static/images/managing-data/core-concepts/part.png'; import Image from '@theme/IdealImage'; @@ -18,7 +17,6 @@ The data from each table in the ClickHouse [MergeTree engine family](/engines/ta To illustrate this, we use [this](https://sql.clickhouse.com/?query=U0hPVyBDUkVBVEUgVEFCTEUgdWsudWtfcHJpY2VfcGFpZF9zaW1wbGU&run_query=true&tab=results) table (adapted from the [UK property prices dataset](/getting-started/example-datasets/uk-price-paid)) tracking the date, town, street, and price for sold properties in the United Kingdom: - ```sql CREATE TABLE uk.uk_price_paid_simple ( @@ -78,27 +76,26 @@ ORDER BY _part ASC; 2. │ all_12_17_1 │ 3. │ all_18_23_1 │ 4. │ all_6_11_1 │ - └─────────────┘ -``` -The query above retrieves the names of directories on disk, with each directory representing an active data part of the table. The components of these directory names have specific meanings, which are documented [here](https://github.com/ClickHouse/ClickHouse/blob/f90551824bb90ade2d8a1d8edd7b0a3c0a459617/src/Storages/MergeTree/MergeTreeData.h#L130) for those interested in exploring further. + └─────────────┘ + ``` + The query above retrieves the names of directories on disk, with each directory representing an active data part of the table. The components of these directory names have specific meanings, which are documented [here](https://github.com/ClickHouse/ClickHouse/blob/f90551824bb90ade2d8a1d8edd7b0a3c0a459617/src/Storages/MergeTree/MergeTreeData.h#L130) for those interested in exploring further. 
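As a rough sketch of what those directory name components encode (assuming the usual `<partition ID>_<minimum block number>_<maximum block number>_<level>` naming scheme), the same pieces are also exposed as separate columns of the [system.parts](/operations/system-tables/parts) system table:

```sql
SELECT
    name,              -- e.g. all_0_5_1
    partition_id,      -- 'all' for a table without a PARTITION BY clause
    min_block_number,  -- lowest block number covered by the part
    max_block_number,  -- highest block number covered by the part
    level              -- number of merges applied so far
FROM system.parts
WHERE (database = 'uk') AND (`table` = 'uk_price_paid_simple') AND active
ORDER BY name ASC;
```

Because the example table has no `PARTITION BY` clause, every part belongs to the single `all` partition, which is why each part name starts with that prefix.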
-Alternatively, ClickHouse tracks info for all parts of all tables in the [system.parts](/operations/system-tables/parts) system table, and the following query [returns](https://sql.clickhouse.com/?query=U0VMRUNUCiAgICBuYW1lLAogICAgbGV2ZWwsCiAgICByb3dzCkZST00gc3lzdGVtLnBhcnRzCldIRVJFIChkYXRhYmFzZSA9ICd1aycpIEFORCAoYHRhYmxlYCA9ICd1a19wcmljZV9wYWlkX3NpbXBsZScpIEFORCBhY3RpdmUKT1JERVIgQlkgbmFtZSBBU0M7&run_query=true&tab=results) for our example table above the list of all currently active parts, their merge level, and the number of rows stored in these parts: + Alternatively, ClickHouse tracks info for all parts of all tables in the [system.parts](/operations/system-tables/parts) system table, and the following query [returns](https://sql.clickhouse.com/?query=U0VMRUNUCiAgICBuYW1lLAogICAgbGV2ZWwsCiAgICByb3dzCkZST00gc3lzdGVtLnBhcnRzCldIRVJFIChkYXRhYmFzZSA9ICd1aycpIEFORCAoYHRhYmxlYCA9ICd1a19wcmljZV9wYWlkX3NpbXBsZScpIEFORCBhY3RpdmUKT1JERVIgQlkgbmFtZSBBU0M7&run_query=true&tab=results) for our example table above the list of all currently active parts, their merge level, and the number of rows stored in these parts: -```sql -SELECT + ```sql + SELECT name, level, rows -FROM system.parts -WHERE (database = 'uk') AND (`table` = 'uk_price_paid_simple') AND active -ORDER BY name ASC; - + FROM system.parts + WHERE (database = 'uk') AND (`table` = 'uk_price_paid_simple') AND active + ORDER BY name ASC; - ┌─name────────┬─level─┬────rows─┐ + ┌─name────────┬─level─┬────rows─┐ 1. │ all_0_5_1 │ 1 │ 6368414 │ 2. │ all_12_17_1 │ 1 │ 6442494 │ 3. │ all_18_23_1 │ 1 │ 5977762 │ 4. │ all_6_11_1 │ 1 │ 6459763 │ - └─────────────┴───────┴─────────┘ -``` -The merge level is incremented by one with each additional merge on the part. A level of 0 indicates this is a new part that has not been merged yet. + └─────────────┴───────┴─────────┘ + ``` + The merge level is incremented by one with each additional merge on the part. A level of 0 indicates this is a new part that has not been merged yet. diff --git a/docs/managing-data/core-concepts/primary-indexes.md b/docs/managing-data/core-concepts/primary-indexes.md index 8163a1405c7..348e232b8e3 100644 --- a/docs/managing-data/core-concepts/primary-indexes.md +++ b/docs/managing-data/core-concepts/primary-indexes.md @@ -5,23 +5,18 @@ description: 'How does the sparse primary index work in ClickHouse' keywords: ['sparse primary index', 'primary index', 'index'] --- - import visual01 from '@site/static/images/managing-data/core-concepts/primary-index-light_01.gif'; import visual02 from '@site/static/images/managing-data/core-concepts/primary-index-light_02.gif'; import visual03 from '@site/static/images/managing-data/core-concepts/primary-index-light_03.gif'; import Image from '@theme/IdealImage'; - :::tip Looking for advanced indexing details? This page introduces ClickHouse's sparse primary index, how it's built, how it works, and how it helps accelerate queries. For advanced indexing strategies and deeper technical detail, see the [primary indexes deep dive](/guides/best-practices/sparse-primary-indexes). ::: - - - ## How does the sparse primary index work in ClickHouse? {#how-does-the-sparse-primary-index-work-in-clickHouse}
    @@ -48,7 +43,6 @@ This granule structure is also what makes the primary index **sparse**: instead Thanks to its sparseness, the primary index is small enough to fit entirely in memory, enabling fast filtering for queries with predicates on primary key columns. In the next section, we show how it helps accelerate such queries. - ### Primary index usage {#primary-index-usage} We sketch how the sparse primary index is used for query acceleration with another animation: @@ -65,7 +59,6 @@ We sketch how the sparse primary index is used for query acceleration with anoth ④ These potentially relevant granules are then loaded and [processed](/optimize/query-parallelism) in memory, along with the corresponding granules from any other columns required for the query. - ## Monitoring primary indexes {#monitoring-primary-indexes} Each [data part](/parts) in the table has its own primary index. We can inspect the contents of these indexes using the [mergeTreeIndex](/sql-reference/table-functions/mergeTreeIndex) table function. @@ -80,123 +73,114 @@ FROM mergeTreeIndex('uk', 'uk_price_paid_simple') GROUP BY part_name; ``` - ```txt ┌─part_name─┬─entries─┐ 1. │ all_2_2_0 │ 914 │ 2. │ all_1_1_0 │ 1343 │ 3. │ all_0_0_0 │ 1349 │ - └───────────┴─────────┘ -``` + └───────────┴─────────┘ + ``` -This query shows the first 10 entries from the primary index of one of the current data parts. Note that these parts are continuously [merged](/merges) in the background into larger parts: + This query shows the first 10 entries from the primary index of one of the current data parts. Note that these parts are continuously [merged](/merges) in the background into larger parts: -```sql -SELECT + ```sql + SELECT mark_number + 1 AS entry, town, street -FROM mergeTreeIndex('uk', 'uk_price_paid_simple') -WHERE part_name = (SELECT any(part_name) FROM mergeTreeIndex('uk', 'uk_price_paid_simple')) -ORDER BY mark_number ASC -LIMIT 10; -``` - + FROM mergeTreeIndex('uk', 'uk_price_paid_simple') + WHERE part_name = (SELECT any(part_name) FROM mergeTreeIndex('uk', 'uk_price_paid_simple')) + ORDER BY mark_number ASC + LIMIT 10; + ``` -```txt + ```txt ┌─entry─┬─town───────────┬─street───────────┐ - 1. │ 1 │ ABBOTS LANGLEY │ ABBEY DRIVE │ - 2. │ 2 │ ABERDARE │ RICHARDS TERRACE │ - 3. │ 3 │ ABERGELE │ PEN Y CAE │ - 4. │ 4 │ ABINGDON │ CHAMBRAI CLOSE │ - 5. │ 5 │ ABINGDON │ THORNLEY CLOSE │ - 6. │ 6 │ ACCRINGTON │ MAY HILL CLOSE │ - 7. │ 7 │ ADDLESTONE │ HARE HILL │ - 8. │ 8 │ ALDEBURGH │ LINDEN ROAD │ - 9. │ 9 │ ALDERSHOT │ HIGH STREET │ + 1. │ 1 │ ABBOTS LANGLEY │ ABBEY DRIVE │ + 2. │ 2 │ ABERDARE │ RICHARDS TERRACE │ + 3. │ 3 │ ABERGELE │ PEN Y CAE │ + 4. │ 4 │ ABINGDON │ CHAMBRAI CLOSE │ + 5. │ 5 │ ABINGDON │ THORNLEY CLOSE │ + 6. │ 6 │ ACCRINGTON │ MAY HILL CLOSE │ + 7. │ 7 │ ADDLESTONE │ HARE HILL │ + 8. │ 8 │ ALDEBURGH │ LINDEN ROAD │ + 9. │ 9 │ ALDERSHOT │ HIGH STREET │ 10. │ 10 │ ALFRETON │ ALMA STREET │ └───────┴────────────────┴──────────────────┘ -``` + ``` -Lastly, we use the [EXPLAIN](/sql-reference/statements/explain) clause to see how the primary indexes of all data parts are used to skip granules that can't possibly contain rows matching the example query's predicates. These granules are excluded from loading and processing: -```sql -EXPLAIN indexes = 1 -SELECT + Lastly, we use the [EXPLAIN](/sql-reference/statements/explain) clause to see how the primary indexes of all data parts are used to skip granules that can't possibly contain rows matching the example query's predicates. 
These granules are excluded from loading and processing: + ```sql + EXPLAIN indexes = 1 + SELECT max(price) -FROM + FROM uk.uk_price_paid_simple -WHERE + WHERE town = 'LONDON' AND street = 'OXFORD STREET'; -``` - + ``` -```txt + ```txt ┌─explain────────────────────────────────────────────────────────────────────────────────────────────────────┐ - 1. │ Expression ((Project names + Projection)) │ - 2. │ Aggregating │ - 3. │ Expression (Before GROUP BY) │ - 4. │ Expression │ - 5. │ ReadFromMergeTree (uk.uk_price_paid_simple) │ - 6. │ Indexes: │ - 7. │ PrimaryKey │ - 8. │ Keys: │ - 9. │ town │ + 1. │ Expression ((Project names + Projection)) │ + 2. │ Aggregating │ + 3. │ Expression (Before GROUP BY) │ + 4. │ Expression │ + 5. │ ReadFromMergeTree (uk.uk_price_paid_simple) │ + 6. │ Indexes: │ + 7. │ PrimaryKey │ + 8. │ Keys: │ + 9. │ town │ 10. │ street │ 11. │ Condition: and((street in ['OXFORD STREET', 'OXFORD STREET']), (town in ['LONDON', 'LONDON'])) │ 12. │ Parts: 3/3 │ 13. │ Granules: 3/3609 │ └────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` + ``` + Note how row 13 of the EXPLAIN output above shows that only 3 out of 3,609 granules across all data parts were selected by the primary index analysis for processing. The remaining granules were skipped entirely. -Note how row 13 of the EXPLAIN output above shows that only 3 out of 3,609 granules across all data parts were selected by the primary index analysis for processing. The remaining granules were skipped entirely. + We can also observe that most of the data was skipped by simply running the query: + ```sql + SELECT max(price) + FROM uk.uk_price_paid_simple + WHERE (town = 'LONDON') AND (street = 'OXFORD STREET'); + ``` -We can also observe that most of the data was skipped by simply running the query: -```sql -SELECT max(price) -FROM uk.uk_price_paid_simple -WHERE (town = 'LONDON') AND (street = 'OXFORD STREET'); -``` - - -```txt - ┌─max(price)─┐ + ```txt + ┌─max(price)─┐ 1. │ 263100000 │ -- 263.10 million - └────────────┘ + └────────────┘ -1 row in set. Elapsed: 0.010 sec. Processed 24.58 thousand rows, 159.04 KB (2.53 million rows/s., 16.35 MB/s.) -Peak memory usage: 13.00 MiB. -``` + 1 row in set. Elapsed: 0.010 sec. Processed 24.58 thousand rows, 159.04 KB (2.53 million rows/s., 16.35 MB/s.) + Peak memory usage: 13.00 MiB. + ``` -As shown above, only around 25,000 rows were processed out of approximately 30 million rows in the example table: -```sql -SELECT count() FROM uk.uk_price_paid_simple; -``` + As shown above, only around 25,000 rows were processed out of approximately 30 million rows in the example table: + ```sql + SELECT count() FROM uk.uk_price_paid_simple; + ``` -```txt - ┌──count()─┐ + ```txt + ┌──count()─┐ 1. │ 29556244 │ -- 29.56 million - └──────────┘ -``` + └──────────┘ + ``` ## Key takeaways {#key-takeaways} -* **Sparse primary indexes** help ClickHouse skip unnecessary data by identifying which granules might contain rows matching query conditions on primary key columns. +* **Sparse primary indexes** help ClickHouse skip unnecessary data by identifying which granules might contain rows matching query conditions on primary key columns. -* Each index stores only the primary key values from the **first row of every granule** (a granule has 8,192 rows by default), making it compact enough to fit in memory. 
+* Each index stores only the primary key values from the **first row of every granule** (a granule has 8,192 rows by default), making it compact enough to fit in memory. -* **Each data part** in a MergeTree table has its **own primary index**, which is used independently during query execution. +* **Each data part** in a MergeTree table has its **own primary index**, which is used independently during query execution. -* During queries, the index allows ClickHouse to **skip granules**, reducing I/O and memory usage while accelerating performance. +* During queries, the index allows ClickHouse to **skip granules**, reducing I/O and memory usage while accelerating performance. * You can **inspect index contents** using the `mergeTreeIndex` table function and monitor index usage with the `EXPLAIN` clause. - ## Where to find more information {#where-to-find-more-information} For a deeper look at how sparse primary indexes work in ClickHouse, including how they differ from traditional database indexes and best practices for using them, check out our detailed indexing [deep dive](/guides/best-practices/sparse-primary-indexes). If you're interested in how ClickHouse processes data selected by the primary index scan in a highly parallel way, see the query parallelism guide [here](/optimize/query-parallelism). - - - diff --git a/docs/managing-data/core-concepts/shards.md b/docs/managing-data/core-concepts/shards.md index 431c0d06270..c6adaea27e3 100644 --- a/docs/managing-data/core-concepts/shards.md +++ b/docs/managing-data/core-concepts/shards.md @@ -37,7 +37,6 @@ Each shard holds a subset of the data and functions as a regular ClickHouse tabl To illustrate **SELECT** query forwarding and **INSERT** routing, we consider the [What are table parts](/parts) example table split across two shards on two ClickHouse servers. First, we show the DDL statement for creating a corresponding **Distributed table** for this setup: - ```sql CREATE TABLE uk.uk_price_paid_simple_dist ON CLUSTER test_cluster ( @@ -79,7 +78,6 @@ This diagram shows how SELECT queries are processed with a distributed table in ② The Distributed table forwards the query to all servers hosting shards of the target table, where each ClickHouse server computes its local aggregation result **in parallel**. - Then, the ClickHouse server hosting the initially targeted distributed table ③ collects all local results, ④ merges them into the final global result, and ⑤ returns it to the query sender. ## What are table replicas in ClickHouse? {#what-are-table-replicas-in-clickhouse} @@ -106,8 +104,6 @@ The rest works the [same](#select-forwarding) as in setups without replicas and Note that ClickHouse allows configuring the query forwarding strategy for ②. By default—unlike in the diagram above—the distributed table [prefers](/docs/operations/settings/settings#prefer_localhost_replica) a local replica if available, but other load balancing [strategies](/docs/operations/settings/settings#load_balancing) can be used. - - ## Where to find more information {#where-to-find-more-information} For more details beyond this high-level introduction to table shards and replicas, check out our [deployment and scaling guide](/docs/architecture/horizontal-scaling). diff --git a/docs/managing-data/deleting-data/index.md b/docs/managing-data/deleting-data/index.md index 718507f9671..3103a9cedbb 100644 --- a/docs/managing-data/deleting-data/index.md +++ b/docs/managing-data/deleting-data/index.md @@ -15,4 +15,3 @@ we will explore how to delete data in ClickHouse. 
| [Delete mutations](/managing-data/delete_mutations) | Learn about Delete Mutations. | | [Truncate table](/managing-data/truncate) | Learn about how to use Truncate, which allows the data in a table or database to be removed, while preserving its existence. | | [Drop partitions](/managing-data/drop_partition) | Learn about Dropping Partitions in ClickHouse. | - diff --git a/docs/managing-data/deleting-data/overview.md b/docs/managing-data/deleting-data/overview.md index ea6e1e09f61..ab662e3cb58 100644 --- a/docs/managing-data/deleting-data/overview.md +++ b/docs/managing-data/deleting-data/overview.md @@ -35,7 +35,7 @@ Read more about [lightweight deletes](/guides/developer/lightweight-delete). ## Delete mutations {#delete-mutations} -Delete mutations can be issued through a `ALTER TABLE ... DELETE` command e.g. +Delete mutations can be issued through a `ALTER TABLE ... DELETE` command e.g. ```sql -- delete all data from 2018 with a mutation. Not recommended. diff --git a/docs/managing-data/updating-data/overview.md b/docs/managing-data/updating-data/overview.md index a1e0bb39300..f39e94760b6 100644 --- a/docs/managing-data/updating-data/overview.md +++ b/docs/managing-data/updating-data/overview.md @@ -41,7 +41,7 @@ Read more about [update mutations](/sql-reference/statements/alter/update). ## Lightweight updates {#lightweight-updates} -Lightweight updates are a ClickHouse feature that updates rows using "patch parts" - special data parts containing only the updated columns and rows, rather than rewriting entire columns like traditional mutations. The Lightweight UPDATE +Lightweight updates are a ClickHouse feature that updates rows using "patch parts" - special data parts containing only the updated columns and rows, rather than rewriting entire columns like traditional mutations. The Lightweight UPDATE Key characteristics: - Uses the standard `UPDATE` syntax and creates patch parts immediately without waiting for merges @@ -49,7 +49,7 @@ Key characteristics: - Designed for small updates (up to ~10% of table) with predictable latency - Adds overhead to `SELECT` queries that need to apply patches, but avoids rewriting entire columns -For more details see ["The Lightweight UPDATE Statement"](/sql-reference/statements/update) + For more details see ["The Lightweight UPDATE Statement"](/sql-reference/statements/update) ## On-the-fly Updates {#on-the-fly-updates} diff --git a/docs/materialized-view/incremental-materialized-view.md b/docs/materialized-view/incremental-materialized-view.md index 50bb492fbb0..15e49d7e688 100644 --- a/docs/materialized-view/incremental-materialized-view.md +++ b/docs/materialized-view/incremental-materialized-view.md @@ -132,46 +132,46 @@ Since the merging of rows is asynchronous, there may be more than one vote per d - Use the `FINAL` modifier on the table name. We did this for the count query above. - Aggregate by the ordering key used in our final table i.e. `CreationDate` and sum the metrics. Typically this is more efficient and flexible (the table can be used for other things), but the former can be simpler for some queries. We show both below: -```sql -SELECT + ```sql + SELECT Day, UpVotes, DownVotes -FROM up_down_votes_per_day -FINAL -ORDER BY Day ASC -LIMIT 10 - -10 rows in set. Elapsed: 0.004 sec. Processed 8.97 thousand rows, 89.68 KB (2.09 million rows/s., 20.89 MB/s.) -Peak memory usage: 289.75 KiB. 
- -SELECT Day, sum(UpVotes) AS UpVotes, sum(DownVotes) AS DownVotes -FROM up_down_votes_per_day -GROUP BY Day -ORDER BY Day ASC -LIMIT 10 -┌────────Day─┬─UpVotes─┬─DownVotes─┐ -│ 2008-07-31 │ 6 │ 0 │ -│ 2008-08-01 │ 182 │ 50 │ -│ 2008-08-02 │ 436 │ 107 │ -│ 2008-08-03 │ 564 │ 100 │ -│ 2008-08-04 │ 1306 │ 259 │ -│ 2008-08-05 │ 1368 │ 269 │ -│ 2008-08-06 │ 1701 │ 211 │ -│ 2008-08-07 │ 1544 │ 211 │ -│ 2008-08-08 │ 1241 │ 212 │ -│ 2008-08-09 │ 576 │ 46 │ -└────────────┴─────────┴───────────┘ - -10 rows in set. Elapsed: 0.010 sec. Processed 8.97 thousand rows, 89.68 KB (907.32 thousand rows/s., 9.07 MB/s.) -Peak memory usage: 567.61 KiB. -``` - -This has sped up our query from 0.133s to 0.004s – an over 25x improvement! - -:::important Important: `ORDER BY` = `GROUP BY` -In most cases the columns used in the `GROUP BY` clause of the Materialized Views transformation, should be consistent with those used in the `ORDER BY` clause of the target table if using the `SummingMergeTree` or `AggregatingMergeTree` table engines. These engines rely on the `ORDER BY` columns to merge rows with identical values during background merge operations. Misalignment between `GROUP BY` and `ORDER BY` columns can lead to inefficient query performance, suboptimal merges, or even data discrepancies. -::: + FROM up_down_votes_per_day + FINAL + ORDER BY Day ASC + LIMIT 10 + + 10 rows in set. Elapsed: 0.004 sec. Processed 8.97 thousand rows, 89.68 KB (2.09 million rows/s., 20.89 MB/s.) + Peak memory usage: 289.75 KiB. + + SELECT Day, sum(UpVotes) AS UpVotes, sum(DownVotes) AS DownVotes + FROM up_down_votes_per_day + GROUP BY Day + ORDER BY Day ASC + LIMIT 10 + ┌────────Day─┬─UpVotes─┬─DownVotes─┐ + │ 2008-07-31 │ 6 │ 0 │ + │ 2008-08-01 │ 182 │ 50 │ + │ 2008-08-02 │ 436 │ 107 │ + │ 2008-08-03 │ 564 │ 100 │ + │ 2008-08-04 │ 1306 │ 259 │ + │ 2008-08-05 │ 1368 │ 269 │ + │ 2008-08-06 │ 1701 │ 211 │ + │ 2008-08-07 │ 1544 │ 211 │ + │ 2008-08-08 │ 1241 │ 212 │ + │ 2008-08-09 │ 576 │ 46 │ + └────────────┴─────────┴───────────┘ + + 10 rows in set. Elapsed: 0.010 sec. Processed 8.97 thousand rows, 89.68 KB (907.32 thousand rows/s., 9.07 MB/s.) + Peak memory usage: 567.61 KiB. + ``` + + This has sped up our query from 0.133s to 0.004s – an over 25x improvement! + + :::important Important: `ORDER BY` = `GROUP BY` + In most cases the columns used in the `GROUP BY` clause of the Materialized Views transformation, should be consistent with those used in the `ORDER BY` clause of the target table if using the `SummingMergeTree` or `AggregatingMergeTree` table engines. These engines rely on the `ORDER BY` columns to merge rows with identical values during background merge operations. Misalignment between `GROUP BY` and `ORDER BY` columns can lead to inefficient query performance, suboptimal merges, or even data discrepancies. + ::: ### A more complex example {#a-more-complex-example} @@ -337,7 +337,6 @@ CREATE TABLE comments_posts_users ( UserId Int32 ) ENGINE = MergeTree ORDER BY UserId - CREATE TABLE comments_null AS comments ENGINE = Null @@ -381,7 +380,7 @@ Incremental Materialized views in ClickHouse fully support `JOIN` operations, bu When an Incremental materialized view is defined using a `JOIN`, the left-most table in the `SELECT` query acts as the source. When new rows are inserted into this table, ClickHouse executes the materialized view query *only* with those newly inserted rows. Right-side tables in the JOIN are read in full during this execution, but changes to them alone do not trigger the view. 
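To make this trigger behavior concrete, the following is a minimal, self-contained sketch; the table and column names are illustrative only and are not part of the Stack Overflow examples used elsewhere in this guide. Only inserts into the left-most table run the view's `SELECT`, while inserts into the right-hand table are stored but never update the target table on their own:

```sql
CREATE TABLE events
(
    id     UInt64,
    dim_id UInt64
)
ENGINE = MergeTree
ORDER BY id;

CREATE TABLE dimensions
(
    dim_id UInt64,
    label  String
)
ENGINE = MergeTree
ORDER BY dim_id;

CREATE TABLE events_enriched
(
    id    UInt64,
    label String
)
ENGINE = MergeTree
ORDER BY id;

-- The left-most table (events) acts as the source: only inserts into it trigger the view.
CREATE MATERIALIZED VIEW events_enriched_mv TO events_enriched AS
SELECT
    e.id,
    d.label
FROM events AS e
LEFT JOIN dimensions AS d ON e.dim_id = d.dim_id;

INSERT INTO dimensions VALUES (1, 'checkout'); -- does NOT trigger the view
INSERT INTO events VALUES (42, 1);             -- triggers the view; reads the full dimensions table during this insert
```

If an `events` row arrives before its matching `dimensions` row, the `LEFT JOIN` fills `label` with an empty default value, and the already-written row in `events_enriched` is not revisited later. This is the same insert-ordering pitfall illustrated with badges and users further below.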
-This behavior makes JOINs in Materialized Views similar to a snapshot join against static dimension data. +This behavior makes JOINs in Materialized Views similar to a snapshot join against static dimension data. This works well for enriching data with reference or dimension tables. However, any updates to the right-side tables (e.g., user metadata) will not retroactively update the materialized view. To see updated data, new inserts must arrive in the source table. @@ -520,7 +519,7 @@ WHERE DisplayName = 'gingerwizard' Notice the latency of the insert here. The inserted user row is joined against the entire `users` table, significantly impacting insert performance. We propose approaches to address this below in ["Using source table in filters and joins"](/materialized-view/incremental-materialized-view#using-source-table-in-filters-and-joins-in-materialized-views). ::: -Conversely, if we insert a badge for a new user, followed by the row for the user, our materialized view will fail to capture the users' metrics. +Conversely, if we insert a badge for a new user, followed by the row for the user, our materialized view will fail to capture the users' metrics. ```sql INSERT INTO badges VALUES (53505059, 23923286, 'Good Answer', now(), 'Bronze', 0); @@ -593,7 +592,6 @@ AS SELECT count(*) AS c0 FROM t0 LEFT JOIN ( SELECT * FROM t0 ) AS x ON t0.c0 = x.c0; - CREATE MATERIALIZED VIEW mvw2 TO mvw2_inner AS SELECT count(*) AS c0 FROM t0 @@ -758,7 +756,6 @@ ORDER BY UserId These can be populated with the following `INSERT INTO` commands: - ```sql INSERT INTO stackoverflow.badges SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/badges.parquet') @@ -840,7 +837,6 @@ ORDER BY last_activity DESC While this is valid syntactically, it will produce unintended results - the view will only trigger inserts to the `comments` table. For example: - ```sql INSERT INTO comments VALUES (99999999, 23121, 1, 'The answer is 42', now(), 2936484, 'gingerwizard'); @@ -1082,22 +1078,21 @@ Enabling `parallel_view_processing=1` can significantly improve insert throughpu - **Increased insert pressure**: All Materialized Views are executed simultaneously, increasing CPU and memory usage. If each view performs heavy computation or JOINs, this can overload the system. - **Need for strict execution order**: In rare workflows where the order of view execution matters (e.g., chained dependencies), parallel execution may lead to inconsistent state or race conditions. While possible to design around this, such setups are fragile and may break with future versions. -:::note Historical defaults and stability -Sequential execution was the default for a long time, in part due to error handling complexities. Historically, a failure in one materialized view could prevent others from executing. Newer versions have improved this by isolating failures per block, but sequential execution still provides clearer failure semantics. -::: + :::note Historical defaults and stability + Sequential execution was the default for a long time, in part due to error handling complexities. Historically, a failure in one materialized view could prevent others from executing. Newer versions have improved this by isolating failures per block, but sequential execution still provides clearer failure semantics. 
+ ::: -In general, enable `parallel_view_processing=1` when: + In general, enable `parallel_view_processing=1` when: - You have multiple independent Materialized Views - You're aiming to maximize insert performance - You're aware of the system's capacity to handle concurrent view execution -Leave it disabled when: + Leave it disabled when: - Materialized Views have dependencies on one another - You require predictable, ordered execution - You're debugging or auditing insert behavior and want deterministic replay - ## Materialized views and Common Table Expressions (CTE) {#materialized-views-common-table-expressions-ctes} **Non-recursive** Common Table Expressions (CTEs) are supported in Materialized Views. @@ -1187,15 +1182,15 @@ In ClickHouse, CTEs are inlined which means they are effectively copy-pasted int - If your CTE references a different table from the source table (i.e., the one the materialized view is attached to), and is used in a `JOIN` or `IN` clause, it will behave like a subquery or join, not a trigger. - The materialized view will still only trigger on inserts into the main source table, but the CTE will be re-executed on every insert, which may cause unnecessary overhead, especially if the referenced table is large. -For example, + For example, -```sql -WITH recent_users AS ( - SELECT Id FROM stackoverflow.users WHERE CreationDate > now() - INTERVAL 7 DAY -) -SELECT * FROM stackoverflow.posts WHERE OwnerUserId IN (SELECT Id FROM recent_users) -``` + ```sql + WITH recent_users AS ( + SELECT Id FROM stackoverflow.users WHERE CreationDate > now() - INTERVAL 7 DAY + ) + SELECT * FROM stackoverflow.posts WHERE OwnerUserId IN (SELECT Id FROM recent_users) + ``` -In this case, the users CTE is re-evaluated on every insert into posts, and the materialized view will not update when new users are inserted - only when posts are. + In this case, the users CTE is re-evaluated on every insert into posts, and the materialized view will not update when new users are inserted - only when posts are. -Generally, use CTEs for logic that operates on the same source table the materialized view is attached to or ensure that referenced tables are small and unlikely to cause performance bottlenecks. Alternatively, consider [the same optimizations as JOINs with Materialized Views](/materialized-view/incremental-materialized-view#join-best-practices). + Generally, use CTEs for logic that operates on the same source table the materialized view is attached to or ensure that referenced tables are small and unlikely to cause performance bottlenecks. Alternatively, consider [the same optimizations as JOINs with Materialized Views](/materialized-view/incremental-materialized-view#join-best-practices). diff --git a/docs/materialized-view/index.md b/docs/materialized-view/index.md index e6b3062f911..6976401a192 100644 --- a/docs/materialized-view/index.md +++ b/docs/materialized-view/index.md @@ -10,5 +10,4 @@ keywords: ['materialized views', 'speed up queries', 'query optimization', 'refr | [Incremental materialized view](/materialized-view/incremental-materialized-view) | Allow users to shift the cost of computation from query time to insert time, resulting in faster `SELECT` queries. | | [Refreshable materialized view](/materialized-view/refreshable-materialized-view) | Conceptually similar to incremental materialized views but require the periodic execution of the query over the full dataset - the results of which are stored in a target table for querying. 
| - diff --git a/docs/materialized-view/refreshable-materialized-view.md b/docs/materialized-view/refreshable-materialized-view.md index 9b2362b868d..36d7a0b8f2c 100644 --- a/docs/materialized-view/refreshable-materialized-view.md +++ b/docs/materialized-view/refreshable-materialized-view.md @@ -14,12 +14,10 @@ The diagram explains how Refreshable Materialized Views work: Refreshable materialized view diagram - You can also see the following video: - ## When should refreshable materialized views be used? {#when-should-refreshable-materialized-views-be-used} ClickHouse incremental materialized views are enormously powerful and typically scale much better than the approach used by refreshable materialized views, especially in cases where an aggregate over a single table needs to be performed. By only computing the aggregation over each block of data as it is inserted and merging the incremental states in the final table, the query only ever executes on a subset of the data. This method scales to potentially petabytes of data and is usually the preferred method. diff --git a/docs/migrations/bigquery/equivalent-concepts.md b/docs/migrations/bigquery/equivalent-concepts.md index 48352b93c42..4f67af4bded 100644 --- a/docs/migrations/bigquery/equivalent-concepts.md +++ b/docs/migrations/bigquery/equivalent-concepts.md @@ -89,13 +89,13 @@ In ClickHouse, a table can also have a primary key. Like BigQuery, ClickHouse do In addition to the primary index created from the values of a table's primary key columns, ClickHouse allows you to create secondary indexes on columns other than those in the primary key. ClickHouse offers several types of secondary indexes, each suited to different types of queries: - **Bloom Filter Index**: - - Used to speed up queries with equality conditions (e.g., =, IN). - - Uses probabilistic data structures to determine whether a value exists in a data block. + - Used to speed up queries with equality conditions (e.g., =, IN). + - Uses probabilistic data structures to determine whether a value exists in a data block. - **Token Bloom Filter Index**: - - Similar to a Bloom Filter Index but used for tokenized strings and suitable for full-text search queries. + - Similar to a Bloom Filter Index but used for tokenized strings and suitable for full-text search queries. - **Min-Max Index**: - - Maintains the minimum and maximum values of a column for each data part. - - Helps to skip reading data parts that do not fall within the specified range. + - Maintains the minimum and maximum values of a column for each data part. + - Helps to skip reading data parts that do not fall within the specified range. ## Search indexes {#search-indexes} @@ -206,151 +206,151 @@ FROM ) ┌─new_array─┐ 1. 
│ [1,2,3] │ - └───────────┘ -``` - -**Convert an array into a set of rows** - -_BigQuery_ - -[`UNNEST`](https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#unnest_operator) operator - -```sql -SELECT * -FROM UNNEST(['foo', 'bar', 'baz', 'qux', 'corge', 'garply', 'waldo', 'fred']) - AS element -WITH OFFSET AS offset -ORDER BY offset; - -/*----------+--------* - | element | offset | - +----------+--------+ - | foo | 0 | - | bar | 1 | - | baz | 2 | - | qux | 3 | - | corge | 4 | - | garply | 5 | - | waldo | 6 | - | fred | 7 | - *----------+--------*/ -``` - -_ClickHouse_ - -[ARRAY JOIN](/sql-reference/statements/select/array-join) clause - -```sql -WITH ['foo', 'bar', 'baz', 'qux', 'corge', 'garply', 'waldo', 'fred'] AS values -SELECT element, num-1 AS offset -FROM (SELECT values AS element) AS subquery -ARRAY JOIN element, arrayEnumerate(element) AS num; - -/*----------+--------* - | element | offset | - +----------+--------+ - | foo | 0 | - | bar | 1 | - | baz | 2 | - | qux | 3 | - | corge | 4 | - | garply | 5 | - | waldo | 6 | - | fred | 7 | - *----------+--------*/ -``` - -**Return an array of dates** - -_BigQuery_ - -[GENERATE_DATE_ARRAY](https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions#generate_date_array) function - -```sql -SELECT GENERATE_DATE_ARRAY('2016-10-05', '2016-10-08') AS example; - -/*--------------------------------------------------* - | example | - +--------------------------------------------------+ - | [2016-10-05, 2016-10-06, 2016-10-07, 2016-10-08] | - *--------------------------------------------------*/ -``` - -[range](/sql-reference/functions/array-functions#range) + [arrayMap](/sql-reference/functions/array-functions#arrayMap) functions - -_ClickHouse_ - -```sql -SELECT arrayMap(x -> (toDate('2016-10-05') + x), range(toUInt32((toDate('2016-10-08') - toDate('2016-10-05')) + 1))) AS example - - ┌─example───────────────────────────────────────────────┐ + └───────────┘ + ``` + + **Convert an array into a set of rows** + + _BigQuery_ + + [`UNNEST`](https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#unnest_operator) operator + + ```sql + SELECT * + FROM UNNEST(['foo', 'bar', 'baz', 'qux', 'corge', 'garply', 'waldo', 'fred']) + AS element + WITH OFFSET AS offset + ORDER BY offset; + + /*----------+--------* + | element | offset | + +----------+--------+ + | foo | 0 | + | bar | 1 | + | baz | 2 | + | qux | 3 | + | corge | 4 | + | garply | 5 | + | waldo | 6 | + | fred | 7 | + *----------+--------*/ + ``` + + _ClickHouse_ + + [ARRAY JOIN](/sql-reference/statements/select/array-join) clause + + ```sql + WITH ['foo', 'bar', 'baz', 'qux', 'corge', 'garply', 'waldo', 'fred'] AS values + SELECT element, num-1 AS offset + FROM (SELECT values AS element) AS subquery + ARRAY JOIN element, arrayEnumerate(element) AS num; + + /*----------+--------* + | element | offset | + +----------+--------+ + | foo | 0 | + | bar | 1 | + | baz | 2 | + | qux | 3 | + | corge | 4 | + | garply | 5 | + | waldo | 6 | + | fred | 7 | + *----------+--------*/ + ``` + + **Return an array of dates** + + _BigQuery_ + + [GENERATE_DATE_ARRAY](https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions#generate_date_array) function + + ```sql + SELECT GENERATE_DATE_ARRAY('2016-10-05', '2016-10-08') AS example; + + /*--------------------------------------------------* + | example | + +--------------------------------------------------+ + | [2016-10-05, 2016-10-06, 2016-10-07, 2016-10-08] | + 
*--------------------------------------------------*/ + ``` + + [range](/sql-reference/functions/array-functions#range) + [arrayMap](/sql-reference/functions/array-functions#arrayMap) functions + + _ClickHouse_ + + ```sql + SELECT arrayMap(x -> (toDate('2016-10-05') + x), range(toUInt32((toDate('2016-10-08') - toDate('2016-10-05')) + 1))) AS example + + ┌─example───────────────────────────────────────────────┐ 1. │ ['2016-10-05','2016-10-06','2016-10-07','2016-10-08'] │ - └───────────────────────────────────────────────────────┘ -``` + └───────────────────────────────────────────────────────┘ + ``` -**Return an array of timestamps** + **Return an array of timestamps** -_BigQuery_ + _BigQuery_ -[GENERATE_TIMESTAMP_ARRAY](https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions#generate_timestamp_array) function + [GENERATE_TIMESTAMP_ARRAY](https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions#generate_timestamp_array) function -```sql -SELECT GENERATE_TIMESTAMP_ARRAY('2016-10-05 00:00:00', '2016-10-07 00:00:00', + ```sql + SELECT GENERATE_TIMESTAMP_ARRAY('2016-10-05 00:00:00', '2016-10-07 00:00:00', INTERVAL 1 DAY) AS timestamp_array; -/*--------------------------------------------------------------------------* - | timestamp_array | - +--------------------------------------------------------------------------+ - | [2016-10-05 00:00:00+00, 2016-10-06 00:00:00+00, 2016-10-07 00:00:00+00] | - *--------------------------------------------------------------------------*/ -``` + /*--------------------------------------------------------------------------* + | timestamp_array | + +--------------------------------------------------------------------------+ + | [2016-10-05 00:00:00+00, 2016-10-06 00:00:00+00, 2016-10-07 00:00:00+00] | + *--------------------------------------------------------------------------*/ + ``` -_ClickHouse_ + _ClickHouse_ -[range](/sql-reference/functions/array-functions#range) + [arrayMap](/sql-reference/functions/array-functions#arrayMap) functions + [range](/sql-reference/functions/array-functions#range) + [arrayMap](/sql-reference/functions/array-functions#arrayMap) functions -```sql -SELECT arrayMap(x -> (toDateTime('2016-10-05 00:00:00') + toIntervalDay(x)), range(dateDiff('day', toDateTime('2016-10-05 00:00:00'), toDateTime('2016-10-07 00:00:00')) + 1)) AS timestamp_array + ```sql + SELECT arrayMap(x -> (toDateTime('2016-10-05 00:00:00') + toIntervalDay(x)), range(dateDiff('day', toDateTime('2016-10-05 00:00:00'), toDateTime('2016-10-07 00:00:00')) + 1)) AS timestamp_array -Query id: b324c11f-655b-479f-9337-f4d34fd02190 + Query id: b324c11f-655b-479f-9337-f4d34fd02190 - ┌─timestamp_array─────────────────────────────────────────────────────┐ + ┌─timestamp_array─────────────────────────────────────────────────────┐ 1. 
│ ['2016-10-05 00:00:00','2016-10-06 00:00:00','2016-10-07 00:00:00'] │ - └─────────────────────────────────────────────────────────────────────┘ -``` + └─────────────────────────────────────────────────────────────────────┘ + ``` -**Filtering arrays** + **Filtering arrays** -_BigQuery_ + _BigQuery_ -Requires temporarily converting arrays back to tables via [`UNNEST`](https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#unnest_operator) operator + Requires temporarily converting arrays back to tables via [`UNNEST`](https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#unnest_operator) operator -```sql -WITH Sequences AS - (SELECT [0, 1, 1, 2, 3, 5] AS some_numbers - UNION ALL SELECT [2, 4, 8, 16, 32] AS some_numbers - UNION ALL SELECT [5, 10] AS some_numbers) -SELECT - ARRAY(SELECT x * 2 + ```sql + WITH Sequences AS + (SELECT [0, 1, 1, 2, 3, 5] AS some_numbers + UNION ALL SELECT [2, 4, 8, 16, 32] AS some_numbers + UNION ALL SELECT [5, 10] AS some_numbers) + SELECT + ARRAY(SELECT x * 2 FROM UNNEST(some_numbers) AS x WHERE x < 5) AS doubled_less_than_five -FROM Sequences; - -/*------------------------* - | doubled_less_than_five | - +------------------------+ - | [0, 2, 2, 4, 6] | - | [4, 8] | - | [] | - *------------------------*/ -``` + FROM Sequences; -_ClickHouse_ + /*------------------------* + | doubled_less_than_five | + +------------------------+ + | [0, 2, 2, 4, 6] | + | [4, 8] | + | [] | + *------------------------*/ + ``` -[arrayFilter](/sql-reference/functions/array-functions#arrayFilter) function + _ClickHouse_ -```sql -WITH Sequences AS + [arrayFilter](/sql-reference/functions/array-functions#arrayFilter) function + + ```sql + WITH Sequences AS ( SELECT [0, 1, 1, 2, 3, 5] AS some_numbers UNION ALL @@ -358,34 +358,34 @@ WITH Sequences AS UNION ALL SELECT [5, 10] AS some_numbers ) -SELECT arrayMap(x -> (x * 2), arrayFilter(x -> (x < 5), some_numbers)) AS doubled_less_than_five -FROM Sequences; - ┌─doubled_less_than_five─┐ + SELECT arrayMap(x -> (x * 2), arrayFilter(x -> (x < 5), some_numbers)) AS doubled_less_than_five + FROM Sequences; + ┌─doubled_less_than_five─┐ 1. │ [0,2,2,4,6] │ - └────────────────────────┘ - ┌─doubled_less_than_five─┐ + └────────────────────────┘ + ┌─doubled_less_than_five─┐ 2. │ [] │ - └────────────────────────┘ - ┌─doubled_less_than_five─┐ + └────────────────────────┘ + ┌─doubled_less_than_five─┐ 3. 
│ [4,8] │ - └────────────────────────┘ -``` + └────────────────────────┘ + ``` -**Zipping arrays** + **Zipping arrays** -_BigQuery_ + _BigQuery_ -Requires temporarily converting arrays back to tables via [`UNNEST`](https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#unnest_operator) operator + Requires temporarily converting arrays back to tables via [`UNNEST`](https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#unnest_operator) operator -```sql -WITH - Combinations AS ( + ```sql + WITH + Combinations AS ( SELECT ['a', 'b'] AS letters, [1, 2, 3] AS numbers - ) -SELECT - ARRAY( + ) + SELECT + ARRAY( SELECT AS STRUCT letters[SAFE_OFFSET(index)] AS letter, numbers[SAFE_OFFSET(index)] AS number @@ -396,65 +396,65 @@ SELECT 0, LEAST(ARRAY_LENGTH(letters), ARRAY_LENGTH(numbers)) - 1)) AS index ORDER BY index - ); - -/*------------------------------* - | pairs | - +------------------------------+ - | [{ letter: "a", number: 1 }, | - | { letter: "b", number: 2 }] | - *------------------------------*/ -``` + ); -_ClickHouse_ + /*------------------------------* + | pairs | + +------------------------------+ + | [{ letter: "a", number: 1 }, | + | { letter: "b", number: 2 }] | + *------------------------------*/ + ``` -[arrayZip](/sql-reference/functions/array-functions#arrayZip) function + _ClickHouse_ -```sql -WITH Combinations AS + [arrayZip](/sql-reference/functions/array-functions#arrayZip) function + + ```sql + WITH Combinations AS ( SELECT ['a', 'b'] AS letters, [1, 2, 3] AS numbers ) -SELECT arrayZip(letters, arrayResize(numbers, length(letters))) AS pairs -FROM Combinations; - ┌─pairs─────────────┐ + SELECT arrayZip(letters, arrayResize(numbers, length(letters))) AS pairs + FROM Combinations; + ┌─pairs─────────────┐ 1. │ [('a',1),('b',2)] │ - └───────────────────┘ -``` + └───────────────────┘ + ``` -**Aggregating arrays** + **Aggregating arrays** -_BigQuery_ + _BigQuery_ -Requires converting arrays back to tables via [`UNNEST`](https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#unnest_operator) operator + Requires converting arrays back to tables via [`UNNEST`](https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#unnest_operator) operator -```sql -WITH Sequences AS - (SELECT [0, 1, 1, 2, 3, 5] AS some_numbers - UNION ALL SELECT [2, 4, 8, 16, 32] AS some_numbers - UNION ALL SELECT [5, 10] AS some_numbers) -SELECT some_numbers, - (SELECT SUM(x) - FROM UNNEST(s.some_numbers) AS x) AS sums -FROM Sequences AS s; - -/*--------------------+------* - | some_numbers | sums | - +--------------------+------+ - | [0, 1, 1, 2, 3, 5] | 12 | - | [2, 4, 8, 16, 32] | 62 | - | [5, 10] | 15 | - *--------------------+------*/ -``` + ```sql + WITH Sequences AS + (SELECT [0, 1, 1, 2, 3, 5] AS some_numbers + UNION ALL SELECT [2, 4, 8, 16, 32] AS some_numbers + UNION ALL SELECT [5, 10] AS some_numbers) + SELECT some_numbers, + (SELECT SUM(x) + FROM UNNEST(s.some_numbers) AS x) AS sums + FROM Sequences AS s; -_ClickHouse_ + /*--------------------+------* + | some_numbers | sums | + +--------------------+------+ + | [0, 1, 1, 2, 3, 5] | 12 | + | [2, 4, 8, 16, 32] | 62 | + | [5, 10] | 15 | + *--------------------+------*/ + ``` -[arraySum](/sql-reference/functions/array-functions#arraySum), [arrayAvg](/sql-reference/functions/array-functions#arrayAvg), ... 
function, or any of the over 90 existing aggregate function names as argument for the [arrayReduce](/sql-reference/functions/array-functions#arrayReduce) function + _ClickHouse_ -```sql -WITH Sequences AS + [arraySum](/sql-reference/functions/array-functions#arraySum), [arrayAvg](/sql-reference/functions/array-functions#arrayAvg), ... function, or any of the over 90 existing aggregate function names as argument for the [arrayReduce](/sql-reference/functions/array-functions#arrayReduce) function + + ```sql + WITH Sequences AS ( SELECT [0, 1, 1, 2, 3, 5] AS some_numbers UNION ALL @@ -462,17 +462,17 @@ WITH Sequences AS UNION ALL SELECT [5, 10] AS some_numbers ) -SELECT + SELECT some_numbers, arraySum(some_numbers) AS sums -FROM Sequences; - ┌─some_numbers──┬─sums─┐ + FROM Sequences; + ┌─some_numbers──┬─sums─┐ 1. │ [0,1,1,2,3,5] │ 12 │ - └───────────────┴──────┘ - ┌─some_numbers──┬─sums─┐ + └───────────────┴──────┘ + ┌─some_numbers──┬─sums─┐ 2. │ [2,4,8,16,32] │ 62 │ - └───────────────┴──────┘ - ┌─some_numbers─┬─sums─┐ + └───────────────┴──────┘ + ┌─some_numbers─┬─sums─┐ 3. │ [5,10] │ 15 │ - └──────────────┴──────┘ -``` + └──────────────┴──────┘ + ``` diff --git a/docs/migrations/bigquery/index.md b/docs/migrations/bigquery/index.md index 51493f28d02..712b098173c 100644 --- a/docs/migrations/bigquery/index.md +++ b/docs/migrations/bigquery/index.md @@ -11,7 +11,6 @@ In this section of the docs, learn more about the similarities and differences b | Page | Description | |-----------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------| -| [BigQuery vs ClickHouse Cloud](./equivalent-concepts.md) | The way resources are organized in ClickHouse Cloud is similar to BigQuery's resource hierarchy. We describe the specific differences in this article. | +| [BigQuery vs ClickHouse Cloud](./equivalent-concepts.md) | The way resources are organized in ClickHouse Cloud is similar to BigQuery's resource hierarchy. We describe the specific differences in this article. | | [Migrating from BigQuery to ClickHouse Cloud](./migrating-to-clickhouse-cloud.md) | Learn about why you might want to migrate from BigQuery to ClickHouse Cloud. | | [Loading Data](./loading-data.md) | A guide showing you how to migrate data from BigQuery to ClickHouse. | - diff --git a/docs/migrations/bigquery/migrating-to-clickhouse-cloud.md b/docs/migrations/bigquery/migrating-to-clickhouse-cloud.md index 8c89305f4ac..df8b6cc087f 100644 --- a/docs/migrations/bigquery/migrating-to-clickhouse-cloud.md +++ b/docs/migrations/bigquery/migrating-to-clickhouse-cloud.md @@ -50,9 +50,9 @@ BigQuery supports exporting data to Google's object store (GCS). For our example 2. Import the data into ClickHouse Cloud. For that we can use the [gcs table function](/sql-reference/table-functions/gcs). The DDL and import queries are available [here](https://pastila.nl/?00531abf/f055a61cc96b1ba1383d618721059976#Wf4Tn43D3VCU5Hx7tbf1Qw==). Note that because a ClickHouse Cloud instance consists of multiple compute nodes, instead of the `gcs` table function, we are using the [s3Cluster table function](/sql-reference/table-functions/s3Cluster) instead. This function also works with gcs buckets and [utilizes all nodes of a ClickHouse Cloud service](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part1#parallel-servers) to load the data in parallel. 
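As a hedged sketch of what step 2 can look like in practice (the bucket path, credentials, and target table below are placeholders rather than the exact statements linked above), a Parquet export sitting in GCS can be loaded with the `s3Cluster` table function and the HMAC key and secret:

```sql
INSERT INTO stackoverflow.posts
SELECT *
FROM s3Cluster(
    'default',                                                     -- cluster name ('default' on ClickHouse Cloud)
    'https://storage.googleapis.com/your-bucket/posts/*.parquet',  -- placeholder path to the exported files
    '<gcs-hmac-access-key>',                                       -- placeholder HMAC access key
    '<gcs-hmac-secret>',                                           -- placeholder HMAC secret
    'Parquet');
```

The glob pattern lets the nodes of the service split the exported files among themselves, which pairs well with the automatic 1GB-per-file splitting of BigQuery exports described below.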
-Bulk loading + Bulk loading -This approach has a number of advantages: + This approach has a number of advantages: - BigQuery export functionality supports a filter for exporting a subset of data. - BigQuery supports exporting to [Parquet, Avro, JSON, and CSV](https://cloud.google.com/bigquery/docs/exporting-data) formats and several [compression types](https://cloud.google.com/bigquery/docs/exporting-data) - all supported by ClickHouse. @@ -60,7 +60,7 @@ This approach has a number of advantages: - [Google allows up to 50TB per day to be exported to GCS for free](https://cloud.google.com/bigquery/quotas#export_jobs). Users only pay for GCS storage. - Exports produce multiple files automatically, limiting each to a maximum of 1GB of table data. This is beneficial to ClickHouse since it allows imports to be parallelized. -Before trying the following examples, we recommend users review the [permissions required for export](https://cloud.google.com/bigquery/docs/exporting-data#required_permissions) and [locality recommendations](https://cloud.google.com/bigquery/docs/exporting-data#data-locations) to maximize export and import performance. + Before trying the following examples, we recommend users review the [permissions required for export](https://cloud.google.com/bigquery/docs/exporting-data#required_permissions) and [locality recommendations](https://cloud.google.com/bigquery/docs/exporting-data#data-locations) to maximize export and import performance. ### Real-time replication or CDC via scheduled queries {#real-time-replication-or-cdc-via-scheduled-queries} @@ -152,11 +152,11 @@ In contrast to BigQuery, ClickHouse automatically creates [a (sparse) primary in - Memory and disk efficiency are paramount to the scale at which ClickHouse is often used. Data is written to ClickHouse tables in chunks known as parts, with rules applied for merging the parts in the background. In ClickHouse, each part has its own primary index. When parts are merged, then the merged part's primary indexes are also merged. Not that these indexes are not built for each row. Instead, the primary index for a part has one index entry per group of rows - this technique is called sparse indexing. - Sparse indexing is possible because ClickHouse stores the rows for a part on disk ordered by a specified key. Instead of directly locating single rows (like a B-Tree-based index), the sparse primary index allows it to quickly (via a binary search over index entries) identify groups of rows that could possibly match the query. The located groups of potentially matching rows are then, in parallel, streamed into the ClickHouse engine in order to find the matches. This index design allows for the primary index to be small (it completely fits into the main memory) while still significantly speeding up query execution times, especially for range queries that are typical in data analytics use cases. For more details, we recommend [this in-depth guide](/guides/best-practices/sparse-primary-indexes). -ClickHouse Primary keys + ClickHouse Primary keys -The selected primary key in ClickHouse will determine not only the index but also the order in which data is written on disk. Because of this, it can dramatically impact compression levels, which can, in turn, affect query performance. An ordering key that causes the values of most columns to be written in a contiguous order will allow the selected compression algorithm (and codecs) to compress the data more effectively. 
+ The selected primary key in ClickHouse will determine not only the index but also the order in which data is written on disk. Because of this, it can dramatically impact compression levels, which can, in turn, affect query performance. An ordering key that causes the values of most columns to be written in a contiguous order will allow the selected compression algorithm (and codecs) to compress the data more effectively. -> All columns in a table will be sorted based on the value of the specified ordering key, regardless of whether they are included in the key itself. For instance, if `CreationDate` is used as the key, the order of values in all other columns will correspond to the order of values in the `CreationDate` column. Multiple ordering keys can be specified - this will order with the same semantics as an `ORDER BY` clause in a `SELECT` query. + > All columns in a table will be sorted based on the value of the specified ordering key, regardless of whether they are included in the key itself. For instance, if `CreationDate` is used as the key, the order of values in all other columns will correspond to the order of values in the `CreationDate` column. Multiple ordering keys can be specified - this will order with the same semantics as an `ORDER BY` clause in a `SELECT` query. ### Choosing an ordering key {#choosing-an-ordering-key} @@ -199,40 +199,40 @@ Partitioning in ClickHouse has similar applications as in BigQuery but with some - **Data management** - In ClickHouse, users should principally consider partitioning to be a data management feature, not a query optimization technique. By separating data logically based on a key, each partition can be operated on independently e.g. deleted. This allows users to move partitions, and thus subsets, between [storage tiers](/integrations/s3#storage-tiers) efficiently on time or [expire data/efficiently delete from the cluster](/sql-reference/statements/alter/partition). In example, below we remove posts from 2008: -```sql -SELECT DISTINCT partition -FROM system.parts -WHERE `table` = 'posts' - -┌─partition─┐ -│ 2008 │ -│ 2009 │ -│ 2010 │ -│ 2011 │ -│ 2012 │ -│ 2013 │ -│ 2014 │ -│ 2015 │ -│ 2016 │ -│ 2017 │ -│ 2018 │ -│ 2019 │ -│ 2020 │ -│ 2021 │ -│ 2022 │ -│ 2023 │ -│ 2024 │ -└───────────┘ - -17 rows in set. Elapsed: 0.002 sec. - -ALTER TABLE posts -(DROP PARTITION '2008') - -Ok. - -0 rows in set. Elapsed: 0.103 sec. -``` + ```sql + SELECT DISTINCT partition + FROM system.parts + WHERE `table` = 'posts' + + ┌─partition─┐ + │ 2008 │ + │ 2009 │ + │ 2010 │ + │ 2011 │ + │ 2012 │ + │ 2013 │ + │ 2014 │ + │ 2015 │ + │ 2016 │ + │ 2017 │ + │ 2018 │ + │ 2019 │ + │ 2020 │ + │ 2021 │ + │ 2022 │ + │ 2023 │ + │ 2024 │ + └───────────┘ + + 17 rows in set. Elapsed: 0.002 sec. + + ALTER TABLE posts + (DROP PARTITION '2008') + + Ok. + + 0 rows in set. Elapsed: 0.103 sec. + ``` - **Query optimization** - While partitions can assist with query performance, this depends heavily on the access patterns. If queries target only a few partitions (ideally one), performance can potentially improve. This is only typically useful if the partitioning key is not in the primary key and you are filtering by it. However, queries that need to cover many partitions may perform worse than if no partitioning is used (as there may possibly be more parts as a result of partitioning). The benefit of targeting a single partition will be even less pronounced to non-existence if the partitioning key is already an early entry in the primary key. 
Partitioning can also be used to [optimize `GROUP BY` queries](/engines/table-engines/mergetree-family/custom-partitioning-key#group-by-optimisation-using-partition-key) if values in each partition are unique. However, in general, users should ensure the primary key is optimized and only consider partitioning as a query optimization technique in exceptional cases where access patterns access a specific predictable subset of the day, e.g., partitioning by day, with most queries in the last day. @@ -249,7 +249,7 @@ Important: Ensure your partitioning key expression does not result in a high car ClickHouse's concept of projections allows users to specify multiple `ORDER BY` clauses for a table. In [ClickHouse data modeling](/data-modeling/schema-design), we explore how materialized views can be used -in ClickHouse to pre-compute aggregations, transform rows, and optimize queries +in ClickHouse to pre-compute aggregations, transform rows, and optimize queries for different access patterns. For the latter, we [provided an example](/materialized-view/incremental-materialized-view#lookup-table) where the materialized view sends rows to a target table with a different ordering key to the original table receiving inserts. @@ -270,7 +270,7 @@ Peak memory usage: 201.93 MiB. ``` This query requires all 90m rows to be scanned (albeit quickly) as the `UserId` -is not the ordering key. Previously, we solved this using a materialized view +is not the ordering key. Previously, we solved this using a materialized view acting as a lookup for the `PostId`. The same problem can be solved with a projection. The command below adds a projection with `ORDER BY user_id`. @@ -282,8 +282,8 @@ SELECT * ORDER BY UserId ALTER TABLE comments MATERIALIZE PROJECTION comments_user_id ``` -Note that we have to first create the projection and then materialize it. -This latter command causes the data to be stored twice on disk in two different +Note that we have to first create the projection and then materialize it. +This latter command causes the data to be stored twice on disk in two different orders. The projection can also be defined when the data is created, as shown below, and will be automatically maintained as data is inserted. @@ -309,7 +309,7 @@ ENGINE = MergeTree ORDER BY PostId ``` -If the projection is created via an `ALTER` command, the creation is asynchronous +If the projection is created via an `ALTER` command, the creation is asynchronous when the `MATERIALIZE PROJECTION` command is issued. Users can confirm the progress of this operation with the following query, waiting for `is_done=1`. @@ -323,68 +323,68 @@ WHERE (`table` = 'comments') AND (command LIKE '%MATERIALIZE%') ┌─parts_to_do─┬─is_done─┬─latest_fail_reason─┐ 1. │ 1 │ 0 │ │ - └─────────────┴─────────┴────────────────────┘ + └─────────────┴─────────┴────────────────────┘ -1 row in set. Elapsed: 0.003 sec. -``` + 1 row in set. Elapsed: 0.003 sec. + ``` -If we repeat the above query, we can see performance has improved significantly -at the expense of additional storage. + If we repeat the above query, we can see performance has improved significantly + at the expense of additional storage. -```sql -SELECT avg(Score) -FROM comments -WHERE UserId = 8592047 + ```sql + SELECT avg(Score) + FROM comments + WHERE UserId = 8592047 - ┌──────────avg(Score)─┐ + ┌──────────avg(Score)─┐ 1. │ 0.18181818181818182 │ - └─────────────────────┘ ---highlight-next-line -1 row in set. Elapsed: 0.008 sec. 
Processed 16.36 thousand rows, 98.17 KB (2.15 million rows/s., 12.92 MB/s.) -Peak memory usage: 4.06 MiB. -``` + └─────────────────────┘ + --highlight-next-line + 1 row in set. Elapsed: 0.008 sec. Processed 16.36 thousand rows, 98.17 KB (2.15 million rows/s., 12.92 MB/s.) + Peak memory usage: 4.06 MiB. + ``` -With an [`EXPLAIN` command](/sql-reference/statements/explain), we also confirm the projection was used to serve this query: + With an [`EXPLAIN` command](/sql-reference/statements/explain), we also confirm the projection was used to serve this query: -```sql -EXPLAIN indexes = 1 -SELECT avg(Score) -FROM comments -WHERE UserId = 8592047 + ```sql + EXPLAIN indexes = 1 + SELECT avg(Score) + FROM comments + WHERE UserId = 8592047 ┌─explain─────────────────────────────────────────────┐ - 1. │ Expression ((Projection + Before ORDER BY)) │ - 2. │ Aggregating │ - 3. │ Filter │ - 4. │ ReadFromMergeTree (comments_user_id) │ - 5. │ Indexes: │ - 6. │ PrimaryKey │ - 7. │ Keys: │ - 8. │ UserId │ - 9. │ Condition: (UserId in [8592047, 8592047]) │ + 1. │ Expression ((Projection + Before ORDER BY)) │ + 2. │ Aggregating │ + 3. │ Filter │ + 4. │ ReadFromMergeTree (comments_user_id) │ + 5. │ Indexes: │ + 6. │ PrimaryKey │ + 7. │ Keys: │ + 8. │ UserId │ + 9. │ Condition: (UserId in [8592047, 8592047]) │ 10. │ Parts: 2/2 │ 11. │ Granules: 2/11360 │ └─────────────────────────────────────────────────────┘ -11 rows in set. Elapsed: 0.004 sec. -``` + 11 rows in set. Elapsed: 0.004 sec. + ``` ### When to use projections {#when-to-use-projections} -Projections are an appealing feature for new users as they are automatically +Projections are an appealing feature for new users as they are automatically maintained as data is inserted. Furthermore, queries can just be sent to a single table where the projections are exploited where possible to speed up the response time. Projections -This is in contrast to materialized views, where the user has to select the +This is in contrast to materialized views, where the user has to select the appropriate optimized target table or rewrite their query, depending on the filters. -This places greater emphasis on user applications and increases client-side +This places greater emphasis on user applications and increases client-side complexity. -Despite these advantages, projections come with some inherent limitations which -users should be aware of and thus should be deployed sparingly. For further +Despite these advantages, projections come with some inherent limitations which +users should be aware of and thus should be deployed sparingly. For further details see ["materialized views versus projections"](/managing-data/materialized-views-versus-projections) We recommend using projections when: @@ -421,44 +421,41 @@ LIMIT 5 3. │ anon │ 19814224 │ 4. │ Tim │ 19028260 │ 5. │ John │ 17638812 │ - └──────────────────┴─────────────┘ + └──────────────────┴─────────────┘ -5 rows in set. Elapsed: 0.076 sec. Processed 24.35 million rows, 140.21 MB (320.82 million rows/s., 1.85 GB/s.) -Peak memory usage: 323.37 MiB. -``` + 5 rows in set. Elapsed: 0.076 sec. Processed 24.35 million rows, 140.21 MB (320.82 million rows/s., 1.85 GB/s.) + Peak memory usage: 323.37 MiB. + ``` -**Which tags receive the most views:** + **Which tags receive the most views:** -_BigQuery_ + _BigQuery_ -
    + BigQuery 1 -BigQuery 1 + _ClickHouse_ -_ClickHouse_ - -```sql --- ClickHouse -SELECT + ```sql + -- ClickHouse + SELECT arrayJoin(arrayFilter(t -> (t != ''), splitByChar('|', Tags))) AS tags, sum(ViewCount) AS views -FROM stackoverflow.posts -GROUP BY tags -ORDER BY views DESC -LIMIT 5 + FROM stackoverflow.posts + GROUP BY tags + ORDER BY views DESC + LIMIT 5 - - ┌─tags───────┬──────views─┐ + ┌─tags───────┬──────views─┐ 1. │ javascript │ 8190916894 │ 2. │ python │ 8175132834 │ 3. │ java │ 7258379211 │ 4. │ c# │ 5476932513 │ 5. │ android │ 4258320338 │ - └────────────┴────────────┘ + └────────────┴────────────┘ -5 rows in set. Elapsed: 0.318 sec. Processed 59.82 million rows, 1.45 GB (188.01 million rows/s., 4.54 GB/s.) -Peak memory usage: 567.41 MiB. -``` + 5 rows in set. Elapsed: 0.318 sec. Processed 59.82 million rows, 1.45 GB (188.01 million rows/s., 4.54 GB/s.) + Peak memory usage: 567.41 MiB. + ``` ## Aggregate functions {#aggregate-functions} @@ -484,7 +481,6 @@ GROUP BY Year ORDER BY Year ASC FORMAT Vertical - Row 1: ────── Year: 2008 diff --git a/docs/migrations/postgres/appendix.md b/docs/migrations/postgres/appendix.md index 6e4b762cfbd..08142a7aa9c 100644 --- a/docs/migrations/postgres/appendix.md +++ b/docs/migrations/postgres/appendix.md @@ -5,7 +5,6 @@ keywords: ['postgres', 'postgresql', 'data types', 'types'] description: 'Additional information relative to migrating from PostgreSQL' --- - import postgresReplicas from '@site/static/images/integrations/data-ingestion/dbms/postgres-replicas.png'; import Image from '@theme/IdealImage'; @@ -94,12 +93,10 @@ This can be achieved in several ways (in order of preference): 1. **Sync replicas manually** - If you write to one replica and read from another, you can use issue `SYSTEM SYNC REPLICA LIGHTWEIGHT` prior to reading. 1. **Enable sequential consistency** - via the query setting [`select_sequential_consistency = 1`](/operations/settings/settings#select_sequential_consistency). In OSS, the setting `insert_quorum = 'auto'` must also be specified. -
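As a rough sketch of both options (the table name `db.events` is a placeholder; adapt it to your schema):

```sql
-- Option 1: catch this replica up before reading from it
SYSTEM SYNC REPLICA db.events LIGHTWEIGHT;
SELECT count() FROM db.events;

-- Option 2: request sequential consistency for the read itself
-- (in OSS, also set insert_quorum = 'auto' on the corresponding INSERTs)
SELECT count() FROM db.events
SETTINGS select_sequential_consistency = 1;
```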
    - -See [here](/cloud/reference/shared-merge-tree#consistency) for further details on enabling these settings. + See [here](/cloud/reference/shared-merge-tree#consistency) for further details on enabling these settings. -> Use of sequential consistency will place a greater load on ClickHouse Keeper. The result can -mean slower inserts and reads. SharedMergeTree, used in ClickHouse Cloud as the main table engine, sequential consistency [incurs less overhead and will scale better](/cloud/reference/shared-merge-tree#consistency). OSS users should use this approach cautiously and measure Keeper load. + > Use of sequential consistency will place a greater load on ClickHouse Keeper. The result can + mean slower inserts and reads. SharedMergeTree, used in ClickHouse Cloud as the main table engine, sequential consistency [incurs less overhead and will scale better](/cloud/reference/shared-merge-tree#consistency). OSS users should use this approach cautiously and measure Keeper load. ## Transactional (ACID) support {#transactional-acid-support} diff --git a/docs/migrations/postgres/data-modeling-techniques.md b/docs/migrations/postgres/data-modeling-techniques.md index f864bd8fb3e..034cbecf026 100644 --- a/docs/migrations/postgres/data-modeling-techniques.md +++ b/docs/migrations/postgres/data-modeling-techniques.md @@ -26,17 +26,17 @@ To understand why using your OLTP primary key in ClickHouse is not appropriate, - Postgres primary keys are, by definition, unique per row. The use of [B-tree structures](/guides/best-practices/sparse-primary-indexes#an-index-design-for-massive-data-scales) allows the efficient lookup of single rows by this key. While ClickHouse can be optimized for the lookup of a single row value, analytics workloads will typically require the reading of a few columns but for many rows. Filters will more often need to identify **a subset of rows** on which an aggregation will be performed. - Memory and disk efficiency are paramount to the scale at which ClickHouse is often used. Data is written to ClickHouse tables in chunks known as parts, with rules applied for merging the parts in the background. In ClickHouse, each part has its own primary index. When parts are merged, the merged part's primary indexes are also merged. Unlike Postgres, these indexes are not built for each row. Instead, the primary index for a part has one index entry per group of rows - this technique is called **sparse indexing**. -- **Sparse indexing** is possible because ClickHouse stores the rows for a part on disk ordered by a specified key. Instead of directly locating single rows (like a B-Tree-based index), the sparse primary index allows it to quickly (via a binary search over index entries) identify groups of rows that could possibly match the query. The located groups of potentially matching rows are then, in parallel, streamed into the ClickHouse engine in order to find the matches. This index design allows for the primary index to be small (it completely fits into the main memory) whilst still significantly speeding up query execution times, especially for range queries that are typical in data analytics use cases. +- **Sparse indexing** is possible because ClickHouse stores the rows for a part on disk ordered by a specified key. Instead of directly locating single rows (like a B-Tree-based index), the sparse primary index allows it to quickly (via a binary search over index entries) identify groups of rows that could possibly match the query. 
The located groups of potentially matching rows are then, in parallel, streamed into the ClickHouse engine in order to find the matches. This index design allows for the primary index to be small (it completely fits into the main memory) whilst still significantly speeding up query execution times, especially for range queries that are typical in data analytics use cases. -For more details, we recommend this [in-depth guide](/guides/best-practices/sparse-primary-indexes). + For more details, we recommend this [in-depth guide](/guides/best-practices/sparse-primary-indexes). -PostgreSQL B-Tree Index + PostgreSQL B-Tree Index -PostgreSQL Sparse Index + PostgreSQL Sparse Index -The selected key in ClickHouse will determine not only the index but also the order in which data is written on disk. Because of this, it can dramatically impact compression levels, which can, in turn, affect query performance. An ordering key that causes the values of most columns to be written in a contiguous order will allow the selected compression algorithm (and codecs) to compress the data more effectively. + The selected key in ClickHouse will determine not only the index but also the order in which data is written on disk. Because of this, it can dramatically impact compression levels, which can, in turn, affect query performance. An ordering key that causes the values of most columns to be written in a contiguous order will allow the selected compression algorithm (and codecs) to compress the data more effectively. -> All columns in a table will be sorted based on the value of the specified ordering key, regardless of whether they are included in the key itself. For instance, if `CreationDate` is used as the key, the order of values in all other columns will correspond to the order of values in the `CreationDate` column. Multiple ordering keys can be specified - this will order with the same semantics as an `ORDER BY` clause in a `SELECT` query. + > All columns in a table will be sorted based on the value of the specified ordering key, regardless of whether they are included in the key itself. For instance, if `CreationDate` is used as the key, the order of values in all other columns will correspond to the order of values in the `CreationDate` column. Multiple ordering keys can be specified - this will order with the same semantics as an `ORDER BY` clause in a `SELECT` query. ### Choosing an ordering key {#choosing-an-ordering-key} @@ -77,40 +77,40 @@ Partitioning in ClickHouse has similar applications as in Postgres but with some - **Data management** - In ClickHouse, users should principally consider partitioning to be a data management feature, not a query optimization technique. By separating data logically based on a key, each partition can be operated on independently e.g. deleted. This allows users to move partitions, and thus subsets, between [storage tiers](/integrations/s3#storage-tiers) efficiently on time or [expire data/efficiently delete from the cluster](/sql-reference/statements/alter/partition). In example, below we remove posts from 2008. -```sql -SELECT DISTINCT partition -FROM system.parts -WHERE `table` = 'posts' - -┌─partition─┐ -│ 2008 │ -│ 2009 │ -│ 2010 │ -│ 2011 │ -│ 2012 │ -│ 2013 │ -│ 2014 │ -│ 2015 │ -│ 2016 │ -│ 2017 │ -│ 2018 │ -│ 2019 │ -│ 2020 │ -│ 2021 │ -│ 2022 │ -│ 2023 │ -│ 2024 │ -└───────────┘ - -17 rows in set. Elapsed: 0.002 sec. - -ALTER TABLE posts -(DROP PARTITION '2008') - -Ok. - -0 rows in set. Elapsed: 0.103 sec. 
-``` + ```sql + SELECT DISTINCT partition + FROM system.parts + WHERE `table` = 'posts' + + ┌─partition─┐ + │ 2008 │ + │ 2009 │ + │ 2010 │ + │ 2011 │ + │ 2012 │ + │ 2013 │ + │ 2014 │ + │ 2015 │ + │ 2016 │ + │ 2017 │ + │ 2018 │ + │ 2019 │ + │ 2020 │ + │ 2021 │ + │ 2022 │ + │ 2023 │ + │ 2024 │ + └───────────┘ + + 17 rows in set. Elapsed: 0.002 sec. + + ALTER TABLE posts + (DROP PARTITION '2008') + + Ok. + + 0 rows in set. Elapsed: 0.103 sec. + ``` - **Query optimization** - While partitions can assist with query performance, this depends heavily on the access patterns. If queries target only a few partitions (ideally one), performance can potentially improve. This is only typically useful if the partitioning key is not in the primary key and you are filtering by it. However, queries that need to cover many partitions may perform worse than if no partitioning is used (as there may possibly be more parts as a result of partitioning). The benefit of targeting a single partition will be even less pronounced to non-existence if the partitioning key is already an early entry in the primary key. Partitioning can also be used to [optimize GROUP BY queries](/engines/table-engines/mergetree-family/custom-partitioning-key#group-by-optimisation-using-partition-key) if values in each partition are unique. However, in general, users should ensure the primary key is optimized and only consider partitioning as a query optimization technique in exceptional cases where access patterns access a specific predictable subset of the day, e.g., partitioning by day, with most queries in the last day. @@ -141,30 +141,30 @@ WHERE UserId = 8592047 ┌──────────avg(Score)─┐ 1. │ 0.18181818181818182 │ - └─────────────────────┘ + └─────────────────────┘ -1 row in set. Elapsed: 0.040 sec. Processed 90.38 million rows, 361.59 MB (2.25 billion rows/s., 9.01 GB/s.) -Peak memory usage: 201.93 MiB. -``` + 1 row in set. Elapsed: 0.040 sec. Processed 90.38 million rows, 361.59 MB (2.25 billion rows/s., 9.01 GB/s.) + Peak memory usage: 201.93 MiB. + ``` -This query requires all 90m rows to be scanned (admittedly quickly) as the `UserId` is not the ordering key. -Previously, we solved this using a materialized view acting as a lookup for the `PostId`. The same problem can be solved -with a [projection](/data-modeling/projections). The command below adds a -projection for the `ORDER BY user_id`. + This query requires all 90m rows to be scanned (admittedly quickly) as the `UserId` is not the ordering key. + Previously, we solved this using a materialized view acting as a lookup for the `PostId`. The same problem can be solved + with a [projection](/data-modeling/projections). The command below adds a + projection for the `ORDER BY user_id`. -```sql -ALTER TABLE comments ADD PROJECTION comments_user_id ( -SELECT * ORDER BY UserId -) + ```sql + ALTER TABLE comments ADD PROJECTION comments_user_id ( + SELECT * ORDER BY UserId + ) -ALTER TABLE comments MATERIALIZE PROJECTION comments_user_id -``` + ALTER TABLE comments MATERIALIZE PROJECTION comments_user_id + ``` -Note that we have to first create the projection and then materialize it. This latter command causes the data to be stored twice on disk in two different orders. The projection can also be defined when the data is created, as shown below, and will be automatically maintained as data is inserted. + Note that we have to first create the projection and then materialize it. This latter command causes the data to be stored twice on disk in two different orders. 
The projection can also be defined when the data is created, as shown below, and will be automatically maintained as data is inserted. -```sql -CREATE TABLE comments -( + ```sql + CREATE TABLE comments + ( `Id` UInt32, `PostId` UInt32, `Score` UInt16, @@ -177,105 +177,105 @@ CREATE TABLE comments SELECT * ORDER BY UserId ) -) -ENGINE = MergeTree -ORDER BY PostId -``` + ) + ENGINE = MergeTree + ORDER BY PostId + ``` -If the projection is created via an `ALTER`, the creation is asynchronous when the `MATERIALIZE PROJECTION` command is issued. Users can confirm the progress of this operation with the following query, waiting for `is_done=1`. + If the projection is created via an `ALTER`, the creation is asynchronous when the `MATERIALIZE PROJECTION` command is issued. Users can confirm the progress of this operation with the following query, waiting for `is_done=1`. -```sql -SELECT + ```sql + SELECT parts_to_do, is_done, latest_fail_reason -FROM system.mutations -WHERE (`table` = 'comments') AND (command LIKE '%MATERIALIZE%') + FROM system.mutations + WHERE (`table` = 'comments') AND (command LIKE '%MATERIALIZE%') - ┌─parts_to_do─┬─is_done─┬─latest_fail_reason─┐ + ┌─parts_to_do─┬─is_done─┬─latest_fail_reason─┐ 1. │ 1 │ 0 │ │ - └─────────────┴─────────┴────────────────────┘ + └─────────────┴─────────┴────────────────────┘ -1 row in set. Elapsed: 0.003 sec. -``` + 1 row in set. Elapsed: 0.003 sec. + ``` -If we repeat the above query, we can see performance has improved significantly at the expense of additional storage. + If we repeat the above query, we can see performance has improved significantly at the expense of additional storage. -```sql -SELECT avg(Score) -FROM comments -WHERE UserId = 8592047 + ```sql + SELECT avg(Score) + FROM comments + WHERE UserId = 8592047 - ┌──────────avg(Score)─┐ + ┌──────────avg(Score)─┐ 1. │ 0.18181818181818182 │ - └─────────────────────┘ + └─────────────────────┘ -1 row in set. Elapsed: 0.008 sec. Processed 16.36 thousand rows, 98.17 KB (2.15 million rows/s., 12.92 MB/s.) -Peak memory usage: 4.06 MiB. -``` + 1 row in set. Elapsed: 0.008 sec. Processed 16.36 thousand rows, 98.17 KB (2.15 million rows/s., 12.92 MB/s.) + Peak memory usage: 4.06 MiB. + ``` -With an `EXPLAIN` command, we also confirm the projection was used to serve this query: + With an `EXPLAIN` command, we also confirm the projection was used to serve this query: -```sql -EXPLAIN indexes = 1 -SELECT avg(Score) -FROM comments -WHERE UserId = 8592047 + ```sql + EXPLAIN indexes = 1 + SELECT avg(Score) + FROM comments + WHERE UserId = 8592047 ┌─explain─────────────────────────────────────────────┐ - 1. │ Expression ((Projection + Before ORDER BY)) │ - 2. │ Aggregating │ - 3. │ Filter │ - 4. │ ReadFromMergeTree (comments_user_id) │ - 5. │ Indexes: │ - 6. │ PrimaryKey │ - 7. │ Keys: │ - 8. │ UserId │ - 9. │ Condition: (UserId in [8592047, 8592047]) │ + 1. │ Expression ((Projection + Before ORDER BY)) │ + 2. │ Aggregating │ + 3. │ Filter │ + 4. │ ReadFromMergeTree (comments_user_id) │ + 5. │ Indexes: │ + 6. │ PrimaryKey │ + 7. │ Keys: │ + 8. │ UserId │ + 9. │ Condition: (UserId in [8592047, 8592047]) │ 10. │ Parts: 2/2 │ 11. │ Granules: 2/11360 │ └─────────────────────────────────────────────────────┘ -11 rows in set. Elapsed: 0.004 sec. -``` + 11 rows in set. Elapsed: 0.004 sec. 
+ ``` ### When to use projections {#when-to-use-projections} -Projections are an appealing feature for new users as they are automatically +Projections are an appealing feature for new users as they are automatically maintained as data is inserted. Furthermore, queries can just be sent to a single table where the projections are exploited where possible to speed up the response time. PostgreSQL projections in ClickHouse -This is in contrast to materialized views, where the user has to select the +This is in contrast to materialized views, where the user has to select the appropriate optimized target table or rewrite their query, depending on the filters. This places greater emphasis on user applications and increases client-side complexity. -Despite these advantages, projections come with some [inherent limitations](/data-modeling/projections#when-to-use-projections) +Despite these advantages, projections come with some [inherent limitations](/data-modeling/projections#when-to-use-projections) which users should be aware of and thus should be deployed sparingly. We recommend using projections when: -- A complete reordering of the data is required. While the expression in the - projection can, in theory, use a `GROUP BY,` materialized views are more - effective for maintaining aggregates. The query optimizer is also more likely - to exploit projections that use a simple reordering, i.e., `SELECT * ORDER BY x`. - Users can select a subset of columns in this expression to reduce storage footprint. -- Users are comfortable with the associated increase in storage footprint and - overhead of writing data twice. Test the impact on insertion speed and - [evaluate the storage overhead](/data-compression/compression-in-clickhouse). +- A complete reordering of the data is required. While the expression in the + projection can, in theory, use a `GROUP BY,` materialized views are more + effective for maintaining aggregates. The query optimizer is also more likely + to exploit projections that use a simple reordering, i.e., `SELECT * ORDER BY x`. + Users can select a subset of columns in this expression to reduce storage footprint. +- Users are comfortable with the associated increase in storage footprint and + overhead of writing data twice. Test the impact on insertion speed and + [evaluate the storage overhead](/data-compression/compression-in-clickhouse). -:::note -Since version 25.5, ClickHouse supports the virtual column `_part_offset` in -projections. This unlocks a more space-efficient way to store projections. + :::note + Since version 25.5, ClickHouse supports the virtual column `_part_offset` in + projections. This unlocks a more space-efficient way to store projections. -For more details see ["Projections"](/data-modeling/projections) -::: + For more details see ["Projections"](/data-modeling/projections) + ::: ## Denormalization {#denormalization} -Since Postgres is a relational database, its data model is heavily [normalized](https://en.wikipedia.org/wiki/Database_normalization), often involving hundreds of tables. In ClickHouse, denormalization can be beneficial at times to optimize JOIN performance. +Since Postgres is a relational database, its data model is heavily [normalized](https://en.wikipedia.org/wiki/Database_normalization), often involving hundreds of tables. In ClickHouse, denormalization can be beneficial at times to optimize JOIN performance. 
You can refer to this [guide](/data-modeling/denormalization) that shows the benefits of denormalizing the Stack Overflow dataset in ClickHouse. diff --git a/docs/migrations/postgres/dataset.md b/docs/migrations/postgres/dataset.md index 2574252e1da..9db7e59cb56 100644 --- a/docs/migrations/postgres/dataset.md +++ b/docs/migrations/postgres/dataset.md @@ -72,11 +72,11 @@ While small for ClickHouse, this dataset is substantial for Postgres. The above Refer to this [guide](/integrations/clickpipes/postgres) to set up ClickPipes for PostgreSQL. The guide is covering many different types of source Postgres instances. -With CDC approach using ClickPipes or PeerDB, each tables in the PostgreSQL database are automatically replicated in ClickHouse. +With CDC approach using ClickPipes or PeerDB, each tables in the PostgreSQL database are automatically replicated in ClickHouse. To handle updates and deletes in near real-time, ClickPipes maps Postgres tables to ClickHouse using [ReplacingMergeTree](/engines/table-engines/mergetree-family/replacingmergetree) engine, specifically designed to handle updates and deletes in ClickHouse. You can find more information on how the data gets replicated to ClickHouse using ClickPipes [here](/integrations/clickpipes/postgres/deduplication#how-does-data-get-replicated). It is important to note that replication using CDC creates duplicated rows in ClickHouse when replicating updates or deletes operations. [See techniques](/integrations/clickpipes/postgres/deduplication#deduplicate-using-final-keyword) using the [FINAL](https://clickhouse.com/docs/sql-reference/statements/select/from#final-modifier) modifier for handling those in ClickHouse. -Let's have a look on how the table `users` is created in ClickHouse using ClickPipes. +Let's have a look on how the table `users` is created in ClickHouse using ClickPipes. ```sql CREATE TABLE users @@ -102,7 +102,7 @@ PRIMARY KEY id ORDER BY id; ``` -Once set up, ClickPipes starts migrating all data from PostgreSQL to ClickHouse. Depending on the network and size of the deployments, this should take only a few minutes for the Stack Overflow dataset. +Once set up, ClickPipes starts migrating all data from PostgreSQL to ClickHouse. Depending on the network and size of the deployments, this should take only a few minutes for the Stack Overflow dataset. ### Manual bulk load with periodic updates {#initial-bulk-load-with-periodic-updates} @@ -111,70 +111,70 @@ Using a manual approach, the initial bulk load of the dataset can be achieved vi - **Table functions** - Using the [Postgres table function](/sql-reference/table-functions/postgresql) in ClickHouse to `SELECT` data from Postgres and `INSERT` it into a ClickHouse table. Relevant to bulk loads up to datasets of several hundred GB. - **Exports** - Exporting to intermediary formats such as CSV or SQL script file. These files can then be loaded into ClickHouse from either the client via the `INSERT FROM INFILE` clause or using object storage and their associated functions i.e. s3, gcs. -When loading data manually from PostgreSQL, you need to first create the tables in ClickHouse. Refer to this [Data Modeling documentation](/data-modeling/schema-design#establish-initial-schema) to that also uses the Stack Overflow dataset to optimize the table schema in ClickHouse. - -Data types between PostgreSQL and ClickHouse might differ. 
To establish the equivalent types for each of the table columns, we can use the `DESCRIBE` command with the [Postgres table function](/sql-reference/table-functions/postgresql). The following command describe the table `posts` in PostgreSQL, modify it according to your environment: - -```sql title="Query" -DESCRIBE TABLE postgresql(':', 'postgres', 'posts', '', '') -SETTINGS describe_compact_output = 1 -``` - -For an overview of data type mapping between PostgreSQL and ClickHouse, refer to the [appendix documentation](/migrations/postgresql/appendix#data-type-mappings). - -The steps for optimizing the types for this schema are identical to if the data has been loaded from other sources e.g. Parquet on S3. Applying the process described in this [alternate guide using Parquet](/data-modeling/schema-design) results in the following schema: - -```sql title="Query" -CREATE TABLE stackoverflow.posts -( - `Id` Int32, - `PostTypeId` Enum('Question' = 1, 'Answer' = 2, 'Wiki' = 3, 'TagWikiExcerpt' = 4, 'TagWiki' = 5, 'ModeratorNomination' = 6, 'WikiPlaceholder' = 7, 'PrivilegeWiki' = 8), - `AcceptedAnswerId` UInt32, - `CreationDate` DateTime, - `Score` Int32, - `ViewCount` UInt32, - `Body` String, - `OwnerUserId` Int32, - `OwnerDisplayName` String, - `LastEditorUserId` Int32, - `LastEditorDisplayName` String, - `LastEditDate` DateTime, - `LastActivityDate` DateTime, - `Title` String, - `Tags` String, - `AnswerCount` UInt16, - `CommentCount` UInt8, - `FavoriteCount` UInt8, - `ContentLicense`LowCardinality(String), - `ParentId` String, - `CommunityOwnedDate` DateTime, - `ClosedDate` DateTime -) -ENGINE = MergeTree -ORDER BY tuple() -COMMENT 'Optimized types' -``` - -We can populate this with a simple `INSERT INTO SELECT`, reading the data from PostgresSQL and inserting into ClickHouse: - -```sql title="Query" -INSERT INTO stackoverflow.posts SELECT * FROM postgresql(':', 'postgres', 'posts', '', '') -0 rows in set. Elapsed: 146.471 sec. Processed 59.82 million rows, 83.82 GB (408.40 thousand rows/s., 572.25 MB/s.) -``` - -Incremental loads can, in turn, be scheduled. If the Postgres table only receives inserts and an incrementing id or timestamp exists, users can use the above table function approach to load increments, i.e. a `WHERE` clause can be applied to the `SELECT`. This approach may also be used to support updates if these are guaranteed to update the same column. Supporting deletes will, however, require a complete reload, which may be difficult to achieve as the table grows. - -We demonstrate an initial load and incremental load using the `CreationDate` (we assume this gets updated if rows are updated).. - -```sql --- initial load -INSERT INTO stackoverflow.posts SELECT * FROM postgresql('', 'postgres', 'posts', 'postgres', '', 'postgres', 'posts', 'postgres', ' ( SELECT (max(CreationDate) FROM stackoverflow.posts) -``` - -> ClickHouse will push down simple `WHERE` clauses such as `=`, `!=`, `>`,`>=`, `<`, `<=`, and IN to the PostgreSQL server. Incremental loads can thus be made more efficient by ensuring an index exists on columns used to identify the change set. - -> A possible method to detect UPDATE operations when using query replication is using the [`XMIN` system column](https://www.postgresql.org/docs/9.1/ddl-system-columns.html) (transaction IDs) as a watermark - a change in this column is indicative of a change and therefore can be applied to the destination table. 
Users employing this approach should be aware that `XMIN` values can wrap around and comparisons require a full table scan, making tracking changes more complex. - -[Click here for Part 2](./rewriting-queries.md) + When loading data manually from PostgreSQL, you need to first create the tables in ClickHouse. Refer to this [Data Modeling documentation](/data-modeling/schema-design#establish-initial-schema) to that also uses the Stack Overflow dataset to optimize the table schema in ClickHouse. + + Data types between PostgreSQL and ClickHouse might differ. To establish the equivalent types for each of the table columns, we can use the `DESCRIBE` command with the [Postgres table function](/sql-reference/table-functions/postgresql). The following command describe the table `posts` in PostgreSQL, modify it according to your environment: + + ```sql title="Query" + DESCRIBE TABLE postgresql(':', 'postgres', 'posts', '', '') + SETTINGS describe_compact_output = 1 + ``` + + For an overview of data type mapping between PostgreSQL and ClickHouse, refer to the [appendix documentation](/migrations/postgresql/appendix#data-type-mappings). + + The steps for optimizing the types for this schema are identical to if the data has been loaded from other sources e.g. Parquet on S3. Applying the process described in this [alternate guide using Parquet](/data-modeling/schema-design) results in the following schema: + + ```sql title="Query" + CREATE TABLE stackoverflow.posts + ( + `Id` Int32, + `PostTypeId` Enum('Question' = 1, 'Answer' = 2, 'Wiki' = 3, 'TagWikiExcerpt' = 4, 'TagWiki' = 5, 'ModeratorNomination' = 6, 'WikiPlaceholder' = 7, 'PrivilegeWiki' = 8), + `AcceptedAnswerId` UInt32, + `CreationDate` DateTime, + `Score` Int32, + `ViewCount` UInt32, + `Body` String, + `OwnerUserId` Int32, + `OwnerDisplayName` String, + `LastEditorUserId` Int32, + `LastEditorDisplayName` String, + `LastEditDate` DateTime, + `LastActivityDate` DateTime, + `Title` String, + `Tags` String, + `AnswerCount` UInt16, + `CommentCount` UInt8, + `FavoriteCount` UInt8, + `ContentLicense`LowCardinality(String), + `ParentId` String, + `CommunityOwnedDate` DateTime, + `ClosedDate` DateTime + ) + ENGINE = MergeTree + ORDER BY tuple() + COMMENT 'Optimized types' + ``` + + We can populate this with a simple `INSERT INTO SELECT`, reading the data from PostgresSQL and inserting into ClickHouse: + + ```sql title="Query" + INSERT INTO stackoverflow.posts SELECT * FROM postgresql(':', 'postgres', 'posts', '', '') + 0 rows in set. Elapsed: 146.471 sec. Processed 59.82 million rows, 83.82 GB (408.40 thousand rows/s., 572.25 MB/s.) + ``` + + Incremental loads can, in turn, be scheduled. If the Postgres table only receives inserts and an incrementing id or timestamp exists, users can use the above table function approach to load increments, i.e. a `WHERE` clause can be applied to the `SELECT`. This approach may also be used to support updates if these are guaranteed to update the same column. Supporting deletes will, however, require a complete reload, which may be difficult to achieve as the table grows. + + We demonstrate an initial load and incremental load using the `CreationDate` (we assume this gets updated if rows are updated).. 
+ + ```sql + -- initial load + INSERT INTO stackoverflow.posts SELECT * FROM postgresql('', 'postgres', 'posts', 'postgres', '', 'postgres', 'posts', 'postgres', ' ( SELECT (max(CreationDate) FROM stackoverflow.posts) + ``` + + > ClickHouse will push down simple `WHERE` clauses such as `=`, `!=`, `>`,`>=`, `<`, `<=`, and IN to the PostgreSQL server. Incremental loads can thus be made more efficient by ensuring an index exists on columns used to identify the change set. + + > A possible method to detect UPDATE operations when using query replication is using the [`XMIN` system column](https://www.postgresql.org/docs/9.1/ddl-system-columns.html) (transaction IDs) as a watermark - a change in this column is indicative of a change and therefore can be applied to the destination table. Users employing this approach should be aware that `XMIN` values can wrap around and comparisons require a full table scan, making tracking changes more complex. + + [Click here for Part 2](./rewriting-queries.md) diff --git a/docs/migrations/postgres/index.md b/docs/migrations/postgres/index.md index 0c3f185618e..e4052fd1ab9 100644 --- a/docs/migrations/postgres/index.md +++ b/docs/migrations/postgres/index.md @@ -14,4 +14,3 @@ description: 'Landing page for the PostgreSQL migrations section' |[Rewriting PostgreSQL Queries](/migrations/postgresql/rewriting-queries)|Part 2 of a guide on migrating from PostgreSQL to ClickHouse. Using a practical example, it demonstrates how to efficiently carry out the migration with a real-time replication (CDC) approach. Many of the concepts covered are also applicable to manual bulk data transfers from PostgreSQL to ClickHouse.| |[Data modeling techniques](/migrations/postgresql/data-modeling-techniques)|Part 3 of a guide on migrating from PostgreSQL to ClickHouse. Using a practical example, it demonstrates how to model data in ClickHouse if migrating from PostgreSQL.| |[Appendix](/migrations/postgresql/appendix)|Additional information relevant to migrating from PostgreSQL| - diff --git a/docs/migrations/postgres/overview.md b/docs/migrations/postgres/overview.md index ca1d195b914..1f8d3b8b32d 100644 --- a/docs/migrations/postgres/overview.md +++ b/docs/migrations/postgres/overview.md @@ -25,7 +25,7 @@ Below section describes the two main strategies for migration: **Real-Time CDC** ### Real-time replication (CDC) {#real-time-replication-cdc} -Change Data Capture (CDC) is the process by which tables are kept in sync between two databases. It is the most efficient approach for most migration from PostgreSQL, but yet more complex as it handles insert, updates and deletes from PostgreSQL to ClickHouse in near real-time. It is ideal for use cases where real-time analytics are important. +Change Data Capture (CDC) is the process by which tables are kept in sync between two databases. It is the most efficient approach for most migration from PostgreSQL, but yet more complex as it handles insert, updates and deletes from PostgreSQL to ClickHouse in near real-time. It is ideal for use cases where real-time analytics are important. Real-time Change Data Capture (CDC) can be implemented in ClickHouse using [ClickPipes](/integrations/clickpipes/postgres/deduplication), if you're using ClickHouse Cloud, or [PeerDB](https://github.com/PeerDB-io/peerdb) in case you're running ClickHouse on-prem. Those solutions handles the complexities of real-time data synchronization, including initial load, by capturing inserts, updates, and deletes from PostgreSQL and replicating them in ClickHouse. 
This approach ensures that the data in ClickHouse is always fresh and accurate without requiring manual intervention. diff --git a/docs/migrations/postgres/rewriting-queries.md b/docs/migrations/postgres/rewriting-queries.md index 451d1b37d9a..0dbf0126144 100644 --- a/docs/migrations/postgres/rewriting-queries.md +++ b/docs/migrations/postgres/rewriting-queries.md @@ -11,7 +11,7 @@ Most SQL queries from your PostgreSQL setup should run in ClickHouse without mod ## Deduplication using CDC {#deduplication-cdc} -When using real-time replication with CDC, keep in mind that updates and deletes may result in duplicate rows. To manage this, you can use techniques involving Views and Refreshable Materialized Views. +When using real-time replication with CDC, keep in mind that updates and deletes may result in duplicate rows. To manage this, you can use techniques involving Views and Refreshable Materialized Views. Refer to this [guide](/integrations/clickpipes/postgres/deduplication#query-like-with-postgres) to learn how to migrate your application from PostgreSQL to ClickHouse with minimal friction when migrating using real-time replication with CDC. @@ -21,7 +21,7 @@ While this is possible to migrate with minimum query rewriting, it is recommende The examples here covers common query patterns and show how to optimize them with ClickHouse. They use the full [Stack Overflow dataset](/getting-started/example-datasets/stackoverflow) (up to April 2024) on equivalent resources in PostgreSQL and ClickHouse (8 cores, 32GiB RAM). -> For simplicity, the queries below omit the use of techniques to deduplicate the data. +> For simplicity, the queries below omit the use of techniques to deduplicate the data. > Counts here will slightly differ as the Postgres data only contains rows which satisfy the referential integrity of the foreign keys. ClickHouse imposes no such constraints and thus has the full dataset e.g. inc. anon users. diff --git a/docs/native-protocol/basics.md b/docs/native-protocol/basics.md index 7f77395a518..b5bb7a01d26 100644 --- a/docs/native-protocol/basics.md +++ b/docs/native-protocol/basics.md @@ -39,76 +39,60 @@ Validate length to prevent OOM: - ```go s := "Hello, world!" - // Writing string length as uvarint. buf := make([]byte, binary.MaxVarintLen64) n := binary.PutUvarint(buf, uint64(len(s))) buf = buf[:n] - // Writing string value. buf = append(buf, s...) ``` - - ```go r := bytes.NewReader([]byte{ - 0xd, 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x2c, - 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21, +0xd, 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x2c, +0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21, }) - // Read length. n, err := binary.ReadUvarint(r) if err != nil { - panic(err) +panic(err) } - // Check n to prevent OOM or runtime exception in make(). const maxSize = 1024 * 1024 * 10 // 10 MB if n > maxSize || n < 0 { - panic("invalid n") +panic("invalid n") } - buf := make([]byte, n) if _, err := io.ReadFull(r, buf); err != nil { - panic(err) +panic(err) } - fmt.Println(string(buf)) // Hello, world! 
``` - - ```hexdump 00000000 0d 48 65 6c 6c 6f 2c 20 77 6f 72 6c 64 21 |.Hello, world!| ``` - - ```text DUhlbGxvLCB3b3JsZCE ``` - - ```go data := []byte{ - 0xd, 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x2c, - 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21, +0xd, 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x2c, +0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21, } ``` - @@ -133,18 +117,14 @@ fmt.Println(d) // 1000 - ```hexdump 00000000 e8 03 00 00 00 00 00 00 |........| ``` - - ```text 6AMAAAAAAAA ``` - diff --git a/docs/native-protocol/client.md b/docs/native-protocol/client.md index 3079c4debe6..5fe29e134f4 100644 --- a/docs/native-protocol/client.md +++ b/docs/native-protocol/client.md @@ -83,7 +83,6 @@ password as defaults. | trace_state | String | Tracing state | | trace_flags | Byte | Tracing flags | - ### Settings {#settings} | field | type | value | description | @@ -102,7 +101,6 @@ Encoded as list, blank key and value denotes end of list. | 1 | WithMergeableState | Until mergeable state | | 2 | Complete | Until full completeness (should be default) | - ## Data {#data} | field | type | description | @@ -127,4 +125,3 @@ No packet body. Server should cancel query. ## Ping {#ping} No packet body. Server should [respond with pong](./server.md#pong). - diff --git a/docs/native-protocol/hash.md b/docs/native-protocol/hash.md index 3de9ac70315..5a07b94bef9 100644 --- a/docs/native-protocol/hash.md +++ b/docs/native-protocol/hash.md @@ -12,7 +12,7 @@ ClickHouse uses **one of the previous** versions of [CityHash from Google](https :::info CityHash has changed the algorithm after we have added it into ClickHouse. -CityHash documentation specifically notes that the user should not rely on +CityHash documentation specifically notes that the user should not rely on specific hash values and should not save it anywhere or use it as a sharding key. But as we exposed this function to the user, we had to fix the version of CityHash (to 1.0.2). And now we guarantee that the behaviour of CityHash functions available in SQL will not change. diff --git a/docs/native-protocol/server.md b/docs/native-protocol/server.md index ebeec5cdb59..f3503a425b0 100644 --- a/docs/native-protocol/server.md +++ b/docs/native-protocol/server.md @@ -41,7 +41,6 @@ Response to [client hello](./client.md#hello). | display_name | String | `Clickhouse` | Server name for UI | | version_patch | UVarInt | `3` | Server patch version | - ## Exception {#exception} Server exception during query processing. @@ -124,7 +123,6 @@ Encoded as **data block** of columns, but is never compressed. The `value` type is `UInt64` or `Int64`, depending on server revision. ::: - | column | type | |--------------|-----------------| | host_name | String | diff --git a/docs/tools-and-utilities/static-files-disk-uploader.md b/docs/tools-and-utilities/static-files-disk-uploader.md index 3de2afe1cd1..b5ae834206b 100644 --- a/docs/tools-and-utilities/static-files-disk-uploader.md +++ b/docs/tools-and-utilities/static-files-disk-uploader.md @@ -34,23 +34,19 @@ When using `clickhouse-static-files-disk-uploader`, you must obtain the metadata 1. Run the following query specifying your target table and database: -
    - -```sql -SELECT data_paths - FROM system.tables - WHERE name = 'mytable' AND database = 'default'; -``` + ```sql + SELECT data_paths + FROM system.tables + WHERE name = 'mytable' AND database = 'default'; + ``` 2. This should return the path to the data directory for the specified table: -
    - -```response -┌─data_paths────────────────────────────────────────────┐ -│ ['./store/bcc/bccc1cfd-d43d-43cf-a5b6-1cda8178f1ee/'] │ -└───────────────────────────────────────────────────────┘ -``` + ```response + ┌─data_paths────────────────────────────────────────────┐ + │ ['./store/bcc/bccc1cfd-d43d-43cf-a5b6-1cda8178f1ee/'] │ + └───────────────────────────────────────────────────────┘ + ``` ## Output table metadata directory to the local filesystem {#output-table-metadata-directory-to-the-local-filesystem} diff --git a/docs/tutorial.md b/docs/tutorial.md index 7868c566297..eea3e228413 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -12,513 +12,446 @@ show_related_blogs: true ## Overview {#overview} -Learn how to ingest and query data in ClickHouse using the New York City taxi example dataset. +Learn how to ingest and query data in ClickHouse using the New York City taxi example dataset. ### Prerequisites {#prerequisites} You need access to a running ClickHouse service to complete this tutorial. For instructions, see the [Quick Start](/get-started/quick-start) guide. - ## Create a new table {#create-a-new-table} - The New York City taxi dataset contains details about millions of taxi rides, with columns including tip amount, tolls, payment type, and more. Create a table to store this data. - 1. Connect to the SQL console: - For ClickHouse Cloud, select a service from the dropdown menu and then select **SQL Console** from the left navigation menu. - For self-managed ClickHouse, connect to the SQL console at `https://_hostname_:8443/play`. Check with your ClickHouse administrator for the details. - - 2. Create the following `trips` table in the `default` database: - ```sql - CREATE TABLE trips - ( - `trip_id` UInt32, - `vendor_id` Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, 'CMT' = 5, 'VTS' = 6, 'DDS' = 7, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14, '' = 15), - `pickup_date` Date, - `pickup_datetime` DateTime, - `dropoff_date` Date, - `dropoff_datetime` DateTime, - `store_and_fwd_flag` UInt8, - `rate_code_id` UInt8, - `pickup_longitude` Float64, - `pickup_latitude` Float64, - `dropoff_longitude` Float64, - `dropoff_latitude` Float64, - `passenger_count` UInt8, - `trip_distance` Float64, - `fare_amount` Float32, - `extra` Float32, - `mta_tax` Float32, - `tip_amount` Float32, - `tolls_amount` Float32, - `ehail_fee` Float32, - `improvement_surcharge` Float32, - `total_amount` Float32, - `payment_type` Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), - `trip_type` UInt8, - `pickup` FixedString(25), - `dropoff` FixedString(25), - `cab_type` Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), - `pickup_nyct2010_gid` Int8, - `pickup_ctlabel` Float32, - `pickup_borocode` Int8, - `pickup_ct2010` String, - `pickup_boroct2010` String, - `pickup_cdeligibil` String, - `pickup_ntacode` FixedString(4), - `pickup_ntaname` String, - `pickup_puma` UInt16, - `dropoff_nyct2010_gid` UInt8, - `dropoff_ctlabel` Float32, - `dropoff_borocode` UInt8, - `dropoff_ct2010` String, - `dropoff_boroct2010` String, - `dropoff_cdeligibil` String, - `dropoff_ntacode` FixedString(4), - `dropoff_ntaname` String, - `dropoff_puma` UInt16 - ) - ENGINE = MergeTree - PARTITION BY toYYYYMM(pickup_date) - ORDER BY pickup_datetime; - ``` - +```sql +CREATE TABLE trips +( +`trip_id` UInt32, +`vendor_id` Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, 'CMT' = 5, 'VTS' = 6, 'DDS' = 7, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14, '' = 15), +`pickup_date` Date, 
+`pickup_datetime` DateTime, +`dropoff_date` Date, +`dropoff_datetime` DateTime, +`store_and_fwd_flag` UInt8, +`rate_code_id` UInt8, +`pickup_longitude` Float64, +`pickup_latitude` Float64, +`dropoff_longitude` Float64, +`dropoff_latitude` Float64, +`passenger_count` UInt8, +`trip_distance` Float64, +`fare_amount` Float32, +`extra` Float32, +`mta_tax` Float32, +`tip_amount` Float32, +`tolls_amount` Float32, +`ehail_fee` Float32, +`improvement_surcharge` Float32, +`total_amount` Float32, +`payment_type` Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), +`trip_type` UInt8, +`pickup` FixedString(25), +`dropoff` FixedString(25), +`cab_type` Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), +`pickup_nyct2010_gid` Int8, +`pickup_ctlabel` Float32, +`pickup_borocode` Int8, +`pickup_ct2010` String, +`pickup_boroct2010` String, +`pickup_cdeligibil` String, +`pickup_ntacode` FixedString(4), +`pickup_ntaname` String, +`pickup_puma` UInt16, +`dropoff_nyct2010_gid` UInt8, +`dropoff_ctlabel` Float32, +`dropoff_borocode` UInt8, +`dropoff_ct2010` String, +`dropoff_boroct2010` String, +`dropoff_cdeligibil` String, +`dropoff_ntacode` FixedString(4), +`dropoff_ntaname` String, +`dropoff_puma` UInt16 +) +ENGINE = MergeTree +PARTITION BY toYYYYMM(pickup_date) +ORDER BY pickup_datetime; +``` ## Add the dataset {#add-the-dataset} - Now that you've created a table, add the New York City taxi data from CSV files in S3. - 1. The following command inserts ~2,000,000 rows into your `trips` table from two different files in S3: `trips_1.tsv.gz` and `trips_2.tsv.gz`: - - ```sql - INSERT INTO trips - SELECT * FROM s3( - 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/trips_{1..2}.gz', - 'TabSeparatedWithNames', " - `trip_id` UInt32, - `vendor_id` Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, 'CMT' = 5, 'VTS' = 6, 'DDS' = 7, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14, '' = 15), - `pickup_date` Date, - `pickup_datetime` DateTime, - `dropoff_date` Date, - `dropoff_datetime` DateTime, - `store_and_fwd_flag` UInt8, - `rate_code_id` UInt8, - `pickup_longitude` Float64, - `pickup_latitude` Float64, - `dropoff_longitude` Float64, - `dropoff_latitude` Float64, - `passenger_count` UInt8, - `trip_distance` Float64, - `fare_amount` Float32, - `extra` Float32, - `mta_tax` Float32, - `tip_amount` Float32, - `tolls_amount` Float32, - `ehail_fee` Float32, - `improvement_surcharge` Float32, - `total_amount` Float32, - `payment_type` Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), - `trip_type` UInt8, - `pickup` FixedString(25), - `dropoff` FixedString(25), - `cab_type` Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), - `pickup_nyct2010_gid` Int8, - `pickup_ctlabel` Float32, - `pickup_borocode` Int8, - `pickup_ct2010` String, - `pickup_boroct2010` String, - `pickup_cdeligibil` String, - `pickup_ntacode` FixedString(4), - `pickup_ntaname` String, - `pickup_puma` UInt16, - `dropoff_nyct2010_gid` UInt8, - `dropoff_ctlabel` Float32, - `dropoff_borocode` UInt8, - `dropoff_ct2010` String, - `dropoff_boroct2010` String, - `dropoff_cdeligibil` String, - `dropoff_ntacode` FixedString(4), - `dropoff_ntaname` String, - `dropoff_puma` UInt16 - ") SETTINGS input_format_try_infer_datetimes = 0 - ``` - +```sql +INSERT INTO trips +SELECT * FROM s3( +'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/trips_{1..2}.gz', +'TabSeparatedWithNames', " +`trip_id` UInt32, +`vendor_id` Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, 'CMT' = 5, 'VTS' = 6, 'DDS' = 7, 'B02512' = 10, 'B02598' 
= 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14, '' = 15), +`pickup_date` Date, +`pickup_datetime` DateTime, +`dropoff_date` Date, +`dropoff_datetime` DateTime, +`store_and_fwd_flag` UInt8, +`rate_code_id` UInt8, +`pickup_longitude` Float64, +`pickup_latitude` Float64, +`dropoff_longitude` Float64, +`dropoff_latitude` Float64, +`passenger_count` UInt8, +`trip_distance` Float64, +`fare_amount` Float32, +`extra` Float32, +`mta_tax` Float32, +`tip_amount` Float32, +`tolls_amount` Float32, +`ehail_fee` Float32, +`improvement_surcharge` Float32, +`total_amount` Float32, +`payment_type` Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), +`trip_type` UInt8, +`pickup` FixedString(25), +`dropoff` FixedString(25), +`cab_type` Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), +`pickup_nyct2010_gid` Int8, +`pickup_ctlabel` Float32, +`pickup_borocode` Int8, +`pickup_ct2010` String, +`pickup_boroct2010` String, +`pickup_cdeligibil` String, +`pickup_ntacode` FixedString(4), +`pickup_ntaname` String, +`pickup_puma` UInt16, +`dropoff_nyct2010_gid` UInt8, +`dropoff_ctlabel` Float32, +`dropoff_borocode` UInt8, +`dropoff_ct2010` String, +`dropoff_boroct2010` String, +`dropoff_cdeligibil` String, +`dropoff_ntacode` FixedString(4), +`dropoff_ntaname` String, +`dropoff_puma` UInt16 +") SETTINGS input_format_try_infer_datetimes = 0 +``` 2. Wait for the `INSERT` to finish. It might take a moment for the 150 MB of data to be downloaded. - - 3. When the insert is finished, verify it worked: - ```sql - SELECT count() FROM trips - ``` - - This query should return 1,999,657 rows. - +```sql +SELECT count() FROM trips +``` +This query should return 1,999,657 rows. ## Analyze the data {#analyze-the-data} - -Run some queries to analyze the data. Explore the following examples or try your own SQL query. - +Run some queries to analyze the data. Explore the following examples or try your own SQL query. - Calculate the average tip amount: - ```sql - SELECT round(avg(tip_amount), 2) FROM trips - ``` -
    - Expected output -

    - - ```response - ┌─round(avg(tip_amount), 2)─┐ - │ 1.68 │ - └───────────────────────────┘ - ``` - -

    -
    - +```sql +SELECT round(avg(tip_amount), 2) FROM trips +``` +
    +Expected output +

    +```response +┌─round(avg(tip_amount), 2)─┐ +│ 1.68 │ +└───────────────────────────┘ +``` +

    +
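A single average can hide a lot; if you also want to see the spread of tips, a small variation like the one below (a sketch against the same `trips` table, using ClickHouse's `quantiles` function) returns the median and upper percentiles in one pass:

```sql
-- Sketch: tip distribution instead of a single average.
-- quantiles() returns one value per requested level, as an array.
SELECT quantiles(0.5, 0.9, 0.99)(tip_amount) AS tip_quantiles
FROM trips
```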
    - Calculate the average cost based on the number of passengers: - ```sql - SELECT - passenger_count, - ceil(avg(total_amount),2) AS average_total_amount - FROM trips - GROUP BY passenger_count - ``` - -
    - Expected output -

    - - The `passenger_count` ranges from 0 to 9: - - ```response - ┌─passenger_count─┬─average_total_amount─┐ - │ 0 │ 22.69 │ - │ 1 │ 15.97 │ - │ 2 │ 17.15 │ - │ 3 │ 16.76 │ - │ 4 │ 17.33 │ - │ 5 │ 16.35 │ - │ 6 │ 16.04 │ - │ 7 │ 59.8 │ - │ 8 │ 36.41 │ - │ 9 │ 9.81 │ - └─────────────────┴──────────────────────┘ - ``` - -

    -
    - +```sql +SELECT +passenger_count, +ceil(avg(total_amount),2) AS average_total_amount +FROM trips +GROUP BY passenger_count +``` +
    +Expected output +

    +The `passenger_count` ranges from 0 to 9: +```response +┌─passenger_count─┬─average_total_amount─┐ +│ 0 │ 22.69 │ +│ 1 │ 15.97 │ +│ 2 │ 17.15 │ +│ 3 │ 16.76 │ +│ 4 │ 17.33 │ +│ 5 │ 16.35 │ +│ 6 │ 16.04 │ +│ 7 │ 59.8 │ +│ 8 │ 36.41 │ +│ 9 │ 9.81 │ +└─────────────────┴──────────────────────┘ +``` +

    +
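The averages for the unusual passenger counts rest on very few rows; adding a `count()` column, as in the sketch below, shows how many trips sit behind each figure:

```sql
-- Sketch: same grouping as above, plus the number of trips per group
-- so outliers such as passenger_count = 7 or 9 are easy to spot.
SELECT
    passenger_count,
    ceil(avg(total_amount), 2) AS average_total_amount,
    count() AS trips_in_group
FROM trips
GROUP BY passenger_count
ORDER BY passenger_count
```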
    - Calculate the daily number of pickups per neighborhood: - ```sql - SELECT - pickup_date, - pickup_ntaname, - SUM(1) AS number_of_trips - FROM trips - GROUP BY pickup_date, pickup_ntaname - ORDER BY pickup_date ASC - ``` - -
    - Expected output -

    - - ```response - ┌─pickup_date─┬─pickup_ntaname───────────────────────────────────────────┬─number_of_trips─┐ - │ 2015-07-01 │ Brooklyn Heights-Cobble Hill │ 13 │ - │ 2015-07-01 │ Old Astoria │ 5 │ - │ 2015-07-01 │ Flushing │ 1 │ - │ 2015-07-01 │ Yorkville │ 378 │ - │ 2015-07-01 │ Gramercy │ 344 │ - │ 2015-07-01 │ Fordham South │ 2 │ - │ 2015-07-01 │ SoHo-TriBeCa-Civic Center-Little Italy │ 621 │ - │ 2015-07-01 │ Park Slope-Gowanus │ 29 │ - │ 2015-07-01 │ Bushwick South │ 5 │ - ``` - -

    -
    - +```sql +SELECT +pickup_date, +pickup_ntaname, +SUM(1) AS number_of_trips +FROM trips +GROUP BY pickup_date, pickup_ntaname +ORDER BY pickup_date ASC +``` +
    +Expected output +

    +```response +┌─pickup_date─┬─pickup_ntaname───────────────────────────────────────────┬─number_of_trips─┐ +│ 2015-07-01 │ Brooklyn Heights-Cobble Hill │ 13 │ +│ 2015-07-01 │ Old Astoria │ 5 │ +│ 2015-07-01 │ Flushing │ 1 │ +│ 2015-07-01 │ Yorkville │ 378 │ +│ 2015-07-01 │ Gramercy │ 344 │ +│ 2015-07-01 │ Fordham South │ 2 │ +│ 2015-07-01 │ SoHo-TriBeCa-Civic Center-Little Italy │ 621 │ +│ 2015-07-01 │ Park Slope-Gowanus │ 29 │ +│ 2015-07-01 │ Bushwick South │ 5 │ +``` +

    +
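`SUM(1)` simply counts rows; the equivalent, more idiomatic `count()` works just as well. The sketch below also sorts by trip volume so the busiest neighborhood-days come first:

```sql
-- Sketch: count() instead of SUM(1), ordered by the busiest
-- neighborhood/day combinations.
SELECT
    pickup_date,
    pickup_ntaname,
    count() AS number_of_trips
FROM trips
GROUP BY pickup_date, pickup_ntaname
ORDER BY number_of_trips DESC
LIMIT 10
```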
    - Calculate the length of each trip in minutes, then group the results by trip length: - ```sql - SELECT - avg(tip_amount) AS avg_tip, - avg(fare_amount) AS avg_fare, - avg(passenger_count) AS avg_passenger, - count() AS count, - truncate(date_diff('second', pickup_datetime, dropoff_datetime)/60) as trip_minutes - FROM trips - WHERE trip_minutes > 0 - GROUP BY trip_minutes - ORDER BY trip_minutes DESC - ``` -
    - Expected output -

    - - ```response - ┌──────────────avg_tip─┬───────────avg_fare─┬──────avg_passenger─┬──count─┬─trip_minutes─┐ - │ 1.9600000381469727 │ 8 │ 1 │ 1 │ 27511 │ - │ 0 │ 12 │ 2 │ 1 │ 27500 │ - │ 0.542166673981895 │ 19.716666666666665 │ 1.9166666666666667 │ 60 │ 1439 │ - │ 0.902499997522682 │ 11.270625001192093 │ 1.95625 │ 160 │ 1438 │ - │ 0.9715789457909146 │ 13.646616541353383 │ 2.0526315789473686 │ 133 │ 1437 │ - │ 0.9682692398245518 │ 14.134615384615385 │ 2.076923076923077 │ 104 │ 1436 │ - │ 1.1022105210705808 │ 13.778947368421052 │ 2.042105263157895 │ 95 │ 1435 │ - ``` -

    -
    - - +```sql +SELECT +avg(tip_amount) AS avg_tip, +avg(fare_amount) AS avg_fare, +avg(passenger_count) AS avg_passenger, +count() AS count, +truncate(date_diff('second', pickup_datetime, dropoff_datetime)/60) as trip_minutes +FROM trips +WHERE trip_minutes > 0 +GROUP BY trip_minutes +ORDER BY trip_minutes DESC +``` +
    +Expected output +

    +```response +┌──────────────avg_tip─┬───────────avg_fare─┬──────avg_passenger─┬──count─┬─trip_minutes─┐ +│ 1.9600000381469727 │ 8 │ 1 │ 1 │ 27511 │ +│ 0 │ 12 │ 2 │ 1 │ 27500 │ +│ 0.542166673981895 │ 19.716666666666665 │ 1.9166666666666667 │ 60 │ 1439 │ +│ 0.902499997522682 │ 11.270625001192093 │ 1.95625 │ 160 │ 1438 │ +│ 0.9715789457909146 │ 13.646616541353383 │ 2.0526315789473686 │ 133 │ 1437 │ +│ 0.9682692398245518 │ 14.134615384615385 │ 2.076923076923077 │ 104 │ 1436 │ +│ 1.1022105210705808 │ 13.778947368421052 │ 2.042105263157895 │ 95 │ 1435 │ +``` +

    +
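The first rows above are multi-day outliers. A sketch like the following keeps the same duration expression but restricts the analysis to trips of three hours or less:

```sql
-- Sketch: same trip-duration expression, limited to plausible trips
-- (between 1 minute and 3 hours).
SELECT
    avg(tip_amount) AS avg_tip,
    avg(fare_amount) AS avg_fare,
    count() AS count,
    truncate(date_diff('second', pickup_datetime, dropoff_datetime) / 60) AS trip_minutes
FROM trips
WHERE trip_minutes BETWEEN 1 AND 180
GROUP BY trip_minutes
ORDER BY trip_minutes ASC
```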
    - Show the number of pickups in each neighborhood broken down by hour of the day: - ```sql - SELECT - pickup_ntaname, - toHour(pickup_datetime) as pickup_hour, - SUM(1) AS pickups - FROM trips - WHERE pickup_ntaname != '' - GROUP BY pickup_ntaname, pickup_hour - ORDER BY pickup_ntaname, pickup_hour - ``` -
    - Expected output -

    - - ```response - ┌─pickup_ntaname───────────────────────────────────────────┬─pickup_hour─┬─pickups─┐ - │ Airport │ 0 │ 3509 │ - │ Airport │ 1 │ 1184 │ - │ Airport │ 2 │ 401 │ - │ Airport │ 3 │ 152 │ - │ Airport │ 4 │ 213 │ - │ Airport │ 5 │ 955 │ - │ Airport │ 6 │ 2161 │ - │ Airport │ 7 │ 3013 │ - │ Airport │ 8 │ 3601 │ - │ Airport │ 9 │ 3792 │ - │ Airport │ 10 │ 4546 │ - │ Airport │ 11 │ 4659 │ - │ Airport │ 12 │ 4621 │ - │ Airport │ 13 │ 5348 │ - │ Airport │ 14 │ 5889 │ - │ Airport │ 15 │ 6505 │ - │ Airport │ 16 │ 6119 │ - │ Airport │ 17 │ 6341 │ - │ Airport │ 18 │ 6173 │ - │ Airport │ 19 │ 6329 │ - │ Airport │ 20 │ 6271 │ - │ Airport │ 21 │ 6649 │ - │ Airport │ 22 │ 6356 │ - │ Airport │ 23 │ 6016 │ - │ Allerton-Pelham Gardens │ 4 │ 1 │ - │ Allerton-Pelham Gardens │ 6 │ 1 │ - │ Allerton-Pelham Gardens │ 7 │ 1 │ - │ Allerton-Pelham Gardens │ 9 │ 5 │ - │ Allerton-Pelham Gardens │ 10 │ 3 │ - │ Allerton-Pelham Gardens │ 15 │ 1 │ - │ Allerton-Pelham Gardens │ 20 │ 2 │ - │ Allerton-Pelham Gardens │ 23 │ 1 │ - │ Annadale-Huguenot-Prince's Bay-Eltingville │ 23 │ 1 │ - │ Arden Heights │ 11 │ 1 │ - ``` - -

    -
    - - +```sql +SELECT +pickup_ntaname, +toHour(pickup_datetime) as pickup_hour, +SUM(1) AS pickups +FROM trips +WHERE pickup_ntaname != '' +GROUP BY pickup_ntaname, pickup_hour +ORDER BY pickup_ntaname, pickup_hour +``` +
    +Expected output +

    +```response +┌─pickup_ntaname───────────────────────────────────────────┬─pickup_hour─┬─pickups─┐ +│ Airport │ 0 │ 3509 │ +│ Airport │ 1 │ 1184 │ +│ Airport │ 2 │ 401 │ +│ Airport │ 3 │ 152 │ +│ Airport │ 4 │ 213 │ +│ Airport │ 5 │ 955 │ +│ Airport │ 6 │ 2161 │ +│ Airport │ 7 │ 3013 │ +│ Airport │ 8 │ 3601 │ +│ Airport │ 9 │ 3792 │ +│ Airport │ 10 │ 4546 │ +│ Airport │ 11 │ 4659 │ +│ Airport │ 12 │ 4621 │ +│ Airport │ 13 │ 5348 │ +│ Airport │ 14 │ 5889 │ +│ Airport │ 15 │ 6505 │ +│ Airport │ 16 │ 6119 │ +│ Airport │ 17 │ 6341 │ +│ Airport │ 18 │ 6173 │ +│ Airport │ 19 │ 6329 │ +│ Airport │ 20 │ 6271 │ +│ Airport │ 21 │ 6649 │ +│ Airport │ 22 │ 6356 │ +│ Airport │ 23 │ 6016 │ +│ Allerton-Pelham Gardens │ 4 │ 1 │ +│ Allerton-Pelham Gardens │ 6 │ 1 │ +│ Allerton-Pelham Gardens │ 7 │ 1 │ +│ Allerton-Pelham Gardens │ 9 │ 5 │ +│ Allerton-Pelham Gardens │ 10 │ 3 │ +│ Allerton-Pelham Gardens │ 15 │ 1 │ +│ Allerton-Pelham Gardens │ 20 │ 2 │ +│ Allerton-Pelham Gardens │ 23 │ 1 │ +│ Annadale-Huguenot-Prince's Bay-Eltingville │ 23 │ 1 │ +│ Arden Heights │ 11 │ 1 │ +``` +

    +
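To see the city-wide daily rhythm rather than one row per neighborhood, a sketch such as this drops the `pickup_ntaname` grouping and sorts by volume:

```sql
-- Sketch: busiest pickup hours across all neighborhoods.
SELECT
    toHour(pickup_datetime) AS pickup_hour,
    count() AS pickups
FROM trips
GROUP BY pickup_hour
ORDER BY pickups DESC
```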
    7. Retrieve rides to LaGuardia or JFK airports: - ```sql - SELECT - pickup_datetime, - dropoff_datetime, - total_amount, - pickup_nyct2010_gid, - dropoff_nyct2010_gid, - CASE - WHEN dropoff_nyct2010_gid = 138 THEN 'LGA' - WHEN dropoff_nyct2010_gid = 132 THEN 'JFK' - END AS airport_code, - EXTRACT(YEAR FROM pickup_datetime) AS year, - EXTRACT(DAY FROM pickup_datetime) AS day, - EXTRACT(HOUR FROM pickup_datetime) AS hour - FROM trips - WHERE dropoff_nyct2010_gid IN (132, 138) - ORDER BY pickup_datetime - ``` - -
    - Expected output -

    - - ```response - ┌─────pickup_datetime─┬────dropoff_datetime─┬─total_amount─┬─pickup_nyct2010_gid─┬─dropoff_nyct2010_gid─┬─airport_code─┬─year─┬─day─┬─hour─┐ - │ 2015-07-01 00:04:14 │ 2015-07-01 00:15:29 │ 13.3 │ -34 │ 132 │ JFK │ 2015 │ 1 │ 0 │ - │ 2015-07-01 00:09:42 │ 2015-07-01 00:12:55 │ 6.8 │ 50 │ 138 │ LGA │ 2015 │ 1 │ 0 │ - │ 2015-07-01 00:23:04 │ 2015-07-01 00:24:39 │ 4.8 │ -125 │ 132 │ JFK │ 2015 │ 1 │ 0 │ - │ 2015-07-01 00:27:51 │ 2015-07-01 00:39:02 │ 14.72 │ -101 │ 138 │ LGA │ 2015 │ 1 │ 0 │ - │ 2015-07-01 00:32:03 │ 2015-07-01 00:55:39 │ 39.34 │ 48 │ 138 │ LGA │ 2015 │ 1 │ 0 │ - │ 2015-07-01 00:34:12 │ 2015-07-01 00:40:48 │ 9.95 │ -93 │ 132 │ JFK │ 2015 │ 1 │ 0 │ - │ 2015-07-01 00:38:26 │ 2015-07-01 00:49:00 │ 13.3 │ -11 │ 138 │ LGA │ 2015 │ 1 │ 0 │ - │ 2015-07-01 00:41:48 │ 2015-07-01 00:44:45 │ 6.3 │ -94 │ 132 │ JFK │ 2015 │ 1 │ 0 │ - │ 2015-07-01 01:06:18 │ 2015-07-01 01:14:43 │ 11.76 │ 37 │ 132 │ JFK │ 2015 │ 1 │ 1 │ - ``` - -

    -
    - +```sql +SELECT +pickup_datetime, +dropoff_datetime, +total_amount, +pickup_nyct2010_gid, +dropoff_nyct2010_gid, +CASE +WHEN dropoff_nyct2010_gid = 138 THEN 'LGA' +WHEN dropoff_nyct2010_gid = 132 THEN 'JFK' +END AS airport_code, +EXTRACT(YEAR FROM pickup_datetime) AS year, +EXTRACT(DAY FROM pickup_datetime) AS day, +EXTRACT(HOUR FROM pickup_datetime) AS hour +FROM trips +WHERE dropoff_nyct2010_gid IN (132, 138) +ORDER BY pickup_datetime +``` +
    +Expected output +

    +```response +┌─────pickup_datetime─┬────dropoff_datetime─┬─total_amount─┬─pickup_nyct2010_gid─┬─dropoff_nyct2010_gid─┬─airport_code─┬─year─┬─day─┬─hour─┐ +│ 2015-07-01 00:04:14 │ 2015-07-01 00:15:29 │ 13.3 │ -34 │ 132 │ JFK │ 2015 │ 1 │ 0 │ +│ 2015-07-01 00:09:42 │ 2015-07-01 00:12:55 │ 6.8 │ 50 │ 138 │ LGA │ 2015 │ 1 │ 0 │ +│ 2015-07-01 00:23:04 │ 2015-07-01 00:24:39 │ 4.8 │ -125 │ 132 │ JFK │ 2015 │ 1 │ 0 │ +│ 2015-07-01 00:27:51 │ 2015-07-01 00:39:02 │ 14.72 │ -101 │ 138 │ LGA │ 2015 │ 1 │ 0 │ +│ 2015-07-01 00:32:03 │ 2015-07-01 00:55:39 │ 39.34 │ 48 │ 138 │ LGA │ 2015 │ 1 │ 0 │ +│ 2015-07-01 00:34:12 │ 2015-07-01 00:40:48 │ 9.95 │ -93 │ 132 │ JFK │ 2015 │ 1 │ 0 │ +│ 2015-07-01 00:38:26 │ 2015-07-01 00:49:00 │ 13.3 │ -11 │ 138 │ LGA │ 2015 │ 1 │ 0 │ +│ 2015-07-01 00:41:48 │ 2015-07-01 00:44:45 │ 6.3 │ -94 │ 132 │ JFK │ 2015 │ 1 │ 0 │ +│ 2015-07-01 01:06:18 │ 2015-07-01 01:14:43 │ 11.76 │ 37 │ 132 │ JFK │ 2015 │ 1 │ 1 │ +``` +

    +
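If you would rather aggregate those airport trips than list them individually, a sketch along these lines reuses the same `CASE` labels to count drop-offs and average fares per airport and hour:

```sql
-- Sketch: aggregate airport drop-offs by hour instead of listing rides.
SELECT
    CASE
        WHEN dropoff_nyct2010_gid = 138 THEN 'LGA'
        WHEN dropoff_nyct2010_gid = 132 THEN 'JFK'
    END AS airport_code,
    toHour(pickup_datetime) AS hour,
    count() AS rides,
    round(avg(total_amount), 2) AS avg_total
FROM trips
WHERE dropoff_nyct2010_gid IN (132, 138)
GROUP BY airport_code, hour
ORDER BY airport_code, hour
```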
    ## Create a dictionary {#create-a-dictionary} - -A dictionary is a mapping of key-value pairs stored in memory. For details, see [Dictionaries](/sql-reference/dictionaries/index.md) - +A dictionary is a mapping of key-value pairs stored in memory. For details, see [Dictionaries](/sql-reference/dictionaries/index.md) Create a dictionary associated with a table in your ClickHouse service. -The table and dictionary are based on a CSV file that contains a row for each neighborhood in New York City. - +The table and dictionary are based on a CSV file that contains a row for each neighborhood in New York City. The neighborhoods are mapped to the names of the five New York City boroughs (Bronx, Brooklyn, Manhattan, Queens and Staten Island), as well as Newark Airport (EWR). - Here's an excerpt from the CSV file you're using in table format. The `LocationID` column in the file maps to the `pickup_nyct2010_gid` and `dropoff_nyct2010_gid` columns in your `trips` table: - - | LocationID | Borough | Zone | service_zone | - | ----------- | ----------- | ----------- | ----------- | - | 1 | EWR | Newark Airport | EWR | - | 2 | Queens | Jamaica Bay | Boro Zone | - | 3 | Bronx | Allerton/Pelham Gardens | Boro Zone | - | 4 | Manhattan | Alphabet City | Yellow Zone | - | 5 | Staten Island | Arden Heights | Boro Zone | - - -1. Run the following SQL command, which creates a dictionary named `taxi_zone_dictionary` and populates the dictionary from the CSV file in S3. The URL for the file is `https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/taxi_zone_lookup.csv`. - ```sql - CREATE DICTIONARY taxi_zone_dictionary - ( - `LocationID` UInt16 DEFAULT 0, - `Borough` String, - `Zone` String, - `service_zone` String - ) - PRIMARY KEY LocationID - SOURCE(HTTP(URL 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/taxi_zone_lookup.csv' FORMAT 'CSVWithNames')) - LIFETIME(MIN 0 MAX 0) - LAYOUT(HASHED_ARRAY()) - ``` - - :::note - Setting `LIFETIME` to 0 disables automatic updates to avoid unnecessary traffic to our S3 bucket. In other cases, you might configure it differently. For details, see [Refreshing dictionary data using LIFETIME](/sql-reference/dictionaries#refreshing-dictionary-data-using-lifetime). - ::: - +| LocationID | Borough | Zone | service_zone | +| ----------- | ----------- | ----------- | ----------- | +| 1 | EWR | Newark Airport | EWR | +| 2 | Queens | Jamaica Bay | Boro Zone | +| 3 | Bronx | Allerton/Pelham Gardens | Boro Zone | +| 4 | Manhattan | Alphabet City | Yellow Zone | +| 5 | Staten Island | Arden Heights | Boro Zone | +1. Run the following SQL command, which creates a dictionary named `taxi_zone_dictionary` and populates the dictionary from the CSV file in S3. The URL for the file is `https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/taxi_zone_lookup.csv`. +```sql +CREATE DICTIONARY taxi_zone_dictionary +( +`LocationID` UInt16 DEFAULT 0, +`Borough` String, +`Zone` String, +`service_zone` String +) +PRIMARY KEY LocationID +SOURCE(HTTP(URL 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/taxi_zone_lookup.csv' FORMAT 'CSVWithNames')) +LIFETIME(MIN 0 MAX 0) +LAYOUT(HASHED_ARRAY()) +``` +:::note +Setting `LIFETIME` to 0 disables automatic updates to avoid unnecessary traffic to our S3 bucket. In other cases, you might configure it differently. For details, see [Refreshing dictionary data using LIFETIME](/sql-reference/dictionaries#refreshing-dictionary-data-using-lifetime). +::: 3. Verify it worked. 
The following should return 265 rows, or one row for each neighborhood: - ```sql - SELECT * FROM taxi_zone_dictionary - ``` - +```sql +SELECT * FROM taxi_zone_dictionary +``` 4. Use the `dictGet` function ([or its variations](./sql-reference/functions/ext-dict-functions.md)) to retrieve a value from a dictionary. You pass in the name of the dictionary, the value you want, and the key (which in our example is the `LocationID` column of `taxi_zone_dictionary`). - - For example, the following query returns the `Borough` whose `LocationID` is 132, which corresponds to JFK airport): - ```sql - SELECT dictGet('taxi_zone_dictionary', 'Borough', 132) - ``` - - JFK is in Queens. Notice the time to retrieve the value is essentially 0: - ```response - ┌─dictGet('taxi_zone_dictionary', 'Borough', 132)─┐ - │ Queens │ - └─────────────────────────────────────────────────┘ - - 1 rows in set. Elapsed: 0.004 sec. - ``` - +For example, the following query returns the `Borough` whose `LocationID` is 132, which corresponds to JFK airport): +```sql +SELECT dictGet('taxi_zone_dictionary', 'Borough', 132) +``` +JFK is in Queens. Notice the time to retrieve the value is essentially 0: +```response +┌─dictGet('taxi_zone_dictionary', 'Borough', 132)─┐ +│ Queens │ +└─────────────────────────────────────────────────┘ +1 rows in set. Elapsed: 0.004 sec. +``` 5. Use the `dictHas` function to see if a key is present in the dictionary. For example, the following query returns `1` (which is "true" in ClickHouse): - ```sql - SELECT dictHas('taxi_zone_dictionary', 132) - ``` - +```sql +SELECT dictHas('taxi_zone_dictionary', 132) +``` 6. The following query returns 0 because 4567 is not a value of `LocationID` in the dictionary: - ```sql - SELECT dictHas('taxi_zone_dictionary', 4567) - ``` - +```sql +SELECT dictHas('taxi_zone_dictionary', 4567) +``` 7. Use the `dictGet` function to retrieve a borough's name in a query. For example: - ```sql - SELECT - count(1) AS total, - dictGetOrDefault('taxi_zone_dictionary','Borough', toUInt64(pickup_nyct2010_gid), 'Unknown') AS borough_name - FROM trips - WHERE dropoff_nyct2010_gid = 132 OR dropoff_nyct2010_gid = 138 - GROUP BY borough_name - ORDER BY total DESC - ``` - - This query sums up the number of taxi rides per borough that end at either the LaGuardia or JFK airport. The result looks like the following, and notice there are quite a few trips where the pickup neighborhood is unknown: - ```response - ┌─total─┬─borough_name──┐ - │ 23683 │ Unknown │ - │ 7053 │ Manhattan │ - │ 6828 │ Brooklyn │ - │ 4458 │ Queens │ - │ 2670 │ Bronx │ - │ 554 │ Staten Island │ - │ 53 │ EWR │ - └───────┴───────────────┘ - - 7 rows in set. Elapsed: 0.019 sec. Processed 2.00 million rows, 4.00 MB (105.70 million rows/s., 211.40 MB/s.) - ``` - - +```sql +SELECT +count(1) AS total, +dictGetOrDefault('taxi_zone_dictionary','Borough', toUInt64(pickup_nyct2010_gid), 'Unknown') AS borough_name +FROM trips +WHERE dropoff_nyct2010_gid = 132 OR dropoff_nyct2010_gid = 138 +GROUP BY borough_name +ORDER BY total DESC +``` +This query sums up the number of taxi rides per borough that end at either the LaGuardia or JFK airport. The result looks like the following, and notice there are quite a few trips where the pickup neighborhood is unknown: +```response +┌─total─┬─borough_name──┐ +│ 23683 │ Unknown │ +│ 7053 │ Manhattan │ +│ 6828 │ Brooklyn │ +│ 4458 │ Queens │ +│ 2670 │ Bronx │ +│ 554 │ Staten Island │ +│ 53 │ EWR │ +└───────┴───────────────┘ +7 rows in set. Elapsed: 0.019 sec. 
Processed 2.00 million rows, 4.00 MB (105.70 million rows/s., 211.40 MB/s.) +``` ## Perform a join {#perform-a-join} - Write some queries that join the `taxi_zone_dictionary` with your `trips` table. - 1. Start with a simple `JOIN` that acts similarly to the previous airport query above: - ```sql - SELECT - count(1) AS total, - Borough - FROM trips - JOIN taxi_zone_dictionary ON toUInt64(trips.pickup_nyct2010_gid) = taxi_zone_dictionary.LocationID - WHERE dropoff_nyct2010_gid = 132 OR dropoff_nyct2010_gid = 138 - GROUP BY Borough - ORDER BY total DESC - ``` - - The response looks is identical to the `dictGet` query: - ```response - ┌─total─┬─Borough───────┐ - │  7053 │ Manhattan     │ - │  6828 │ Brooklyn      │ - │  4458 │ Queens        │ - │  2670 │ Bronx         │ - │   554 │ Staten Island │ - │    53 │ EWR           │ - └───────┴───────────────┘ - - 6 rows in set. Elapsed: 0.034 sec. Processed 2.00 million rows, 4.00 MB (59.14 million rows/s., 118.29 MB/s.) - ``` - - :::note - Notice the output of the above `JOIN` query is the same as the query before it that used `dictGetOrDefault` (except that the `Unknown` values are not included). Behind the scenes, ClickHouse is actually calling the `dictGet` function for the `taxi_zone_dictionary` dictionary, but the `JOIN` syntax is more familiar for SQL developers. - ::: - +```sql +SELECT +count(1) AS total, +Borough +FROM trips +JOIN taxi_zone_dictionary ON toUInt64(trips.pickup_nyct2010_gid) = taxi_zone_dictionary.LocationID +WHERE dropoff_nyct2010_gid = 132 OR dropoff_nyct2010_gid = 138 +GROUP BY Borough +ORDER BY total DESC +``` +The response is identical to the `dictGet` query: +```response +┌─total─┬─Borough───────┐ +│  7053 │ Manhattan     │ +│  6828 │ Brooklyn      │ +│  4458 │ Queens        │ +│  2670 │ Bronx         │ +│   554 │ Staten Island │ +│    53 │ EWR           │ +└───────┴───────────────┘ +6 rows in set. Elapsed: 0.034 sec. Processed 2.00 million rows, 4.00 MB (59.14 million rows/s., 118.29 MB/s.) +``` +:::note +Notice the output of the above `JOIN` query is the same as the query before it that used `dictGetOrDefault` (except that the `Unknown` values are not included). Behind the scenes, ClickHouse is actually calling the `dictGet` function for the `taxi_zone_dictionary` dictionary, but the `JOIN` syntax is more familiar for SQL developers. +::: 2. This query returns rows for the 1000 trips with the highest tip amount, then performs an inner join of each row with the dictionary: - ```sql - SELECT * - FROM trips - JOIN taxi_zone_dictionary - ON trips.dropoff_nyct2010_gid = taxi_zone_dictionary.LocationID - WHERE tip_amount > 0 - ORDER BY tip_amount DESC - LIMIT 1000 - ``` - :::note - Generally, we avoid using `SELECT *` often in ClickHouse. You should only retrieve the columns you actually need. However, this query is slower for the purposes of the example. - ::: - +```sql +SELECT * +FROM trips +JOIN taxi_zone_dictionary +ON trips.dropoff_nyct2010_gid = taxi_zone_dictionary.LocationID +WHERE tip_amount > 0 +ORDER BY tip_amount DESC +LIMIT 1000 +``` +:::note +Generally, we avoid using `SELECT *` in ClickHouse. You should only retrieve the columns you actually need. However, for the purposes of this example the query retrieves every column, even though that makes it slower. +:::
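Following the note above, a version of the same join that names only the columns it needs might look like the sketch below; the column list is illustrative rather than prescriptive:

```sql
-- Sketch: the same join, retrieving only a handful of useful columns.
SELECT
    trips.pickup_datetime,
    trips.total_amount,
    trips.tip_amount,
    taxi_zone_dictionary.Borough,
    taxi_zone_dictionary.Zone
FROM trips
JOIN taxi_zone_dictionary
    ON trips.dropoff_nyct2010_gid = taxi_zone_dictionary.LocationID
WHERE tip_amount > 0
ORDER BY tip_amount DESC
LIMIT 1000
```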
    ## Next steps {#next-steps} Learn more about ClickHouse with the following documentation: -- [Introduction to Primary Indexes in ClickHouse](./guides/best-practices/sparse-primary-indexes.md): Learn how ClickHouse uses sparse primary indexes to efficiently locate relevant data during queries. +- [Introduction to Primary Indexes in ClickHouse](./guides/best-practices/sparse-primary-indexes.md): Learn how ClickHouse uses sparse primary indexes to efficiently locate relevant data during queries. - [Integrate an external data source](/integrations/index.mdx): Review data source integration options, including files, Kafka, PostgreSQL, data pipelines, and many others. - [Visualize data in ClickHouse](./integrations/data-visualization/index.md): Connect your favorite UI/BI tool to ClickHouse. - [SQL Reference](./sql-reference/index.md): Browse the SQL functions available in ClickHouse for transforming, processing and analyzing data. - diff --git a/docs/use-cases/AI_ML/MCP/01_remote_mcp.md b/docs/use-cases/AI_ML/MCP/01_remote_mcp.md index bb6953d5807..9f30e0faf7a 100644 --- a/docs/use-cases/AI_ML/MCP/01_remote_mcp.md +++ b/docs/use-cases/AI_ML/MCP/01_remote_mcp.md @@ -25,74 +25,48 @@ import img7 from '@site/static/images/use-cases/AI_ML/MCP/7usage_mcp.png'; > This guide explains how to enable and use the ClickHouse Cloud Remote MCP Server. We will use Claude Code as an MCP Client for this example. -:::note +:::note The remote server capability is currently available in private preview only. Join the waitlist by filling out the form at [clickhouse.ai](https://www.clickhouse.ai) ::: - ## Enable the remote MCP server for your ClickHouse Cloud service {#enable-remote-mcp-server} - 1. Connect to your ClickHouse Cloud Service, click on the `Connect` button, and enable the Remote MCP Server for your Service - Select MCP in the Connect Modal - Enable MCP Server - 2. Copy the URL of the ClickHouse Cloud MCP Server from the `Connect` view or below - -```bash +```bash https://mcp.clickhouse.com/mcp ``` - ## Add the ClickHouse MCP Server in Claude Code {#add-clickhouse-mcp-server-claude-code} - 1. In your working directory, run the following command to add the ClickHouse Cloud MCP Server configuration to Claude Code. In this example, we named the MCP server in the Claude Code config `clickhouse_cloud` - ```bash claude mcp add --transport http clickhouse_cloud https://mcp.clickhouse.com/mcp ``` - 1b. Depending on the MCP Client used, you can also edit the JSON config directly - ```json { - "mcpServers": { - "clickhouse-remote": { - "url": "https://mcp.clickhouse.com/mcp" - } - } +"mcpServers": { +"clickhouse-remote": { +"url": "https://mcp.clickhouse.com/mcp" +} +} } ``` - 2. Launch Claude Code in your working directory - ```bash [user@host ~/Documents/repos/mcp_test] $ claude ``` - ## Authenticate to ClickHouse Cloud via OAuth {#authenticate-via-oauth} - 1. Claude Code will open a browser window on the firgst session. Otherwise, you can also trigger a connection by running the `/mcp` command in Claude Code and selecting the `clickhouse_cloud` MCP server - 2. Authenticate using your ClickHouse Cloud credentials - OAuth Connect flow - OAuth Connect flow success - ## Use the ClickHouse Cloud Remote MCP Server from Claude Code {#use-rempte-mcp-from-claude-code} - 1. Verify in Claude Code that the remote MCP server is connected - Claude Code MCP success - Claude Code MCP Details - 2. Congratulations! 
You can now use the ClickHouse Cloud Remote MCP Server from Claude Code - Claude Code MCP Usage - - diff --git a/docs/use-cases/AI_ML/MCP/02_claude-desktop.md b/docs/use-cases/AI_ML/MCP/02_claude-desktop.md index dc822b5615f..f264ebc5969 100644 --- a/docs/use-cases/AI_ML/MCP/02_claude-desktop.md +++ b/docs/use-cases/AI_ML/MCP/02_claude-desktop.md @@ -25,96 +25,70 @@ import ClaudeConversation from '@site/static/images/use-cases/AI_ML/MCP/claude-c - ## Install uv {#install-uv} - You will need to install [uv](https://docs.astral.sh/uv/) to follow the instructions in this guide. If you don't want to use uv, you will need to update the MCP Server config to use an alternative package manager. - ## Download Claude Desktop {#download-claude-desktop} - You'll also need to install the Claude Desktop app, which you can download from the [Claude Desktop website](https://claude.ai/desktop). - ## Configuring ClickHouse MCP server {#configure-clickhouse-mcp-server} - Once you've got Claude Desktop installed, it's time to configure the [ClickHouse MCP server](https://github.com/ClickHouse/mcp-clickhouse). We can do this via the [Claude Desktop configuration file](https://claude.ai/docs/configuration). - To find this file, first go to the settings page (`Cmd+,` on a Mac) and then click on the `Developer` tab on the left menu. You'll then see the following screen, on which you'll need to click on the `Edit config` button: - Claude Desktop configuration - This will take you to a directory containing the configuration file (`claude_desktop_config.json`). The first time you open that file, it will likely contain the following content: - ```json { - "mcpServers": {} +"mcpServers": {} } ``` - -The `mcpServers` dictionary takes in the name of an MCP Server as a key, and a dictionary of configuration options as a value. +The `mcpServers` dictionary takes in the name of an MCP Server as a key, and a dictionary of configuration options as a value. For example, the ClickHouse MCP server configuration connecting to the ClickHouse Playground would look like this: - ```json { - "mcpServers": { - "mcp-clickhouse": { - "command": "uv", - "args": [ - "run", - "--with", - "mcp-clickhouse", - "--python", - "3.10", - "mcp-clickhouse" - ], - "env": { - "CLICKHOUSE_HOST": "sql-clickhouse.clickhouse.com", - "CLICKHOUSE_PORT": "8443", - "CLICKHOUSE_USER": "demo", - "CLICKHOUSE_PASSWORD": "", - "CLICKHOUSE_SECURE": "true", - "CLICKHOUSE_VERIFY": "true", - "CLICKHOUSE_CONNECT_TIMEOUT": "30", - "CLICKHOUSE_SEND_RECEIVE_TIMEOUT": "30" - } - } - } +"mcpServers": { +"mcp-clickhouse": { +"command": "uv", +"args": [ +"run", +"--with", +"mcp-clickhouse", +"--python", +"3.10", +"mcp-clickhouse" +], +"env": { +"CLICKHOUSE_HOST": "sql-clickhouse.clickhouse.com", +"CLICKHOUSE_PORT": "8443", +"CLICKHOUSE_USER": "demo", +"CLICKHOUSE_PASSWORD": "", +"CLICKHOUSE_SECURE": "true", +"CLICKHOUSE_VERIFY": "true", +"CLICKHOUSE_CONNECT_TIMEOUT": "30", +"CLICKHOUSE_SEND_RECEIVE_TIMEOUT": "30" +} +} +} } ``` - -Once you've updated the config, you'll need to restart Claude Desktop for the changes to take effect. - +Once you've updated the config, you'll need to restart Claude Desktop for the changes to take effect. :::warning Depending on how you installed `uv`, you might receive the following error when restarting Claude Desktop: - ```text MCP mcp-clickhouse: spawn uv ENOENT ``` - If that happens, you'll need to update the `command` to have the full path to `uv`. e.g. 
if you've installed via Cargo, it will be `/Users//.cargo/bin/uv` ::: - ## Using ClickHouse MCP server {#using-clickhouse-mcp-server} - Once you've restarted Claude Desktop, you can find the ClickHouse MCP server by clicking on the `Search and tools` icon: - Find MCP servers
    - You can then choose whether to disable all or some of the tools. - Now we're ready to ask Claude some questions that will result in it using the ClickHouse MCP server. For example, we could ask it `What's the most interesting dataset in the SQL playground?`. - Claude will ask us to confirm the use of each tool in the MCP Server the first time that it's called: - Give permission to use the list_databases tool - Below you can see part of a conversation that includes some tool calls to the ClickHouse MCP Server: - Claude conversation -
    diff --git a/docs/use-cases/AI_ML/MCP/03_librechat.md b/docs/use-cases/AI_ML/MCP/03_librechat.md index f2c88b8a5e9..16c0bf8c3d9 100644 --- a/docs/use-cases/AI_ML/MCP/03_librechat.md +++ b/docs/use-cases/AI_ML/MCP/03_librechat.md @@ -20,39 +20,28 @@ import LibreInterface from '@site/static/images/use-cases/AI_ML/MCP/librechat.pn > and connect it to the ClickHouse example datasets. - ## Install docker {#install-docker} - You will need Docker to run LibreChat and the MCP server. To get Docker: 1. Visit [docker.com](https://www.docker.com/products/docker-desktop) 2. Download Docker desktop for your operating system 3. Install Docker by following the instructions for your operating system 4. Open Docker Desktop and ensure it is running -
    For more information, see the [Docker documentation](https://docs.docker.com/get-docker/). - ## Clone the LibreChat repository {#clone-librechat-repo} - -Open a terminal (command prompt, terminal or PowerShell) and clone the +Open a terminal (command prompt, terminal or PowerShell) and clone the LibreChat repository using the following command: - ```bash git clone https://github.com/danny-avila/LibreChat.git cd LibreChat ``` - ## Create and edit the .env file {#create-and-edit-env-file} - Copy the example configuration file from `.env.example` to `.env`: - ```bash cp .env.example .env ``` - -Open the `.env` file in your favorite text editor. You will see sections for -many popular LLM providers, including OpenAI, Anthropic, AWS bedrock etc, for +Open the `.env` file in your favorite text editor. You will see sections for +many popular LLM providers, including OpenAI, Anthropic, AWS bedrock etc, for example: - ```text title=".venv" #============# # Anthropic # @@ -62,57 +51,45 @@ ANTHROPIC_API_KEY=user_provided # ANTHROPIC_MODELS=claude-opus-4-20250514,claude-sonnet-4-20250514,claude-3-7-sonnet-20250219,claude-3-5-sonnet-20241022,claude-3-5-haiku-20241022,claude-3-opus-20240229,claude-3-sonnet-20240229,claude-3-haiku-20240307 # ANTHROPIC_REVERSE_PROXY= ``` - Replace `user_provided` with your API key for the LLM provider you want to use. - :::note Using a local LLM -If you don't have an API key you can use a local LLM like Ollama. You'll see how +If you don't have an API key you can use a local LLM like Ollama. You'll see how to do this later in step ["Install Ollama"](#add-local-llm-using-ollama). For now don't modify the .env file and continue with the next steps. ::: - ## Create a librechat.yaml file {#create-librechat-yaml-file} - Run the following command to create a new `librechat.yaml` file: - ```bash cp librechat.example.yaml librechat.yaml ``` - This creates the main [configuration file](https://www.librechat.ai/docs/configuration/librechat_yaml) for LibreChat. - ## Add ClickHouse MCP server to Docker compose {#add-clickhouse-mcp-server-to-docker-compose} - -Next we'll add the ClickHouse MCP server to the LibreChat Docker compose file -so that the LLM can interact with the +Next we'll add the ClickHouse MCP server to the LibreChat Docker compose file +so that the LLM can interact with the [ClickHouse SQL playground](https://sql.clickhouse.com/). 
- Create a file called `docker-compose.override.yml` and add the following configuration to it: - ```yml title="docker-compose.override.yml" services: - api: - volumes: - - ./librechat.yaml:/app/librechat.yaml - mcp-clickhouse: - image: mcp/clickhouse - container_name: mcp-clickhouse - ports: - - 8001:8000 - extra_hosts: - - "host.docker.internal:host-gateway" - environment: - - CLICKHOUSE_HOST=sql-clickhouse.clickhouse.com - - CLICKHOUSE_USER=demo - - CLICKHOUSE_PASSWORD= - - CLICKHOUSE_MCP_SERVER_TRANSPORT=sse - - CLICKHOUSE_MCP_BIND_HOST=0.0.0.0 +api: +volumes: +- ./librechat.yaml:/app/librechat.yaml +mcp-clickhouse: +image: mcp/clickhouse +container_name: mcp-clickhouse +ports: +- 8001:8000 +extra_hosts: +- "host.docker.internal:host-gateway" +environment: +- CLICKHOUSE_HOST=sql-clickhouse.clickhouse.com +- CLICKHOUSE_USER=demo +- CLICKHOUSE_PASSWORD= +- CLICKHOUSE_MCP_SERVER_TRANSPORT=sse +- CLICKHOUSE_MCP_BIND_HOST=0.0.0.0 ``` - If you want to explore your own data, you can do so by -using the [host, username and password](https://clickhouse.com/docs/getting-started/quick-start/cloud#connect-with-your-app) +using the [host, username and password](https://clickhouse.com/docs/getting-started/quick-start/cloud#connect-with-your-app) of your own ClickHouse Cloud service. - - ## Configure MCP server in librechat.yaml {#configure-mcp-server-in-librechat-yaml} - Open `librechat.yaml` and place the following configuration at the end of the file: - ```yml mcpServers: - clickhouse-playground: - type: sse - url: http://host.docker.internal:8001/sse +clickhouse-playground: +type: sse +url: http://host.docker.internal:8001/sse ``` - This configures LibreChat to connect to the MCP server running on Docker. - -Find the following line: - +Find the following line: ```text title="librechat.yaml" socialLogins: ['github', 'google', 'discord', 'openid', 'facebook', 'apple', 'saml'] ``` - For simplicity, we will remove the need to authenticate for now: - ```text title="librechat.yaml" socialLogins: [] ``` - ## Add a local LLM using Ollama (optional) {#add-local-llm-using-ollama} - ### Install Ollama {#install-ollama} - Go to the [Ollama website](https://ollama.com/download) and install Ollama for your system. - Once installed, you can run a model like this: - ```bash ollama run qwen3:32b ``` - This will pull the model to your local machine if it is not present. - For a list of models see the [Ollama library](https://ollama.com/library) - ### Configure Ollama in librechat.yaml {#configure-ollama-in-librechat-yaml} - Once the model has downloaded, configure it in `librechat.yaml`: - ```text title="librechat.yaml" custom: - - name: "Ollama" - apiKey: "ollama" - baseURL: "http://host.docker.internal:11434/v1/" - models: - default: - [ - "qwen3:32b" - ] - fetch: false - titleConvo: true - titleModel: "current_model" - summarize: false - summaryModel: "current_model" - forcePrompt: false - modelDisplayLabel: "Ollama" +- name: "Ollama" +apiKey: "ollama" +baseURL: "http://host.docker.internal:11434/v1/" +models: +default: +[ +"qwen3:32b" +] +fetch: false +titleConvo: true +titleModel: "current_model" +summarize: false +summaryModel: "current_model" +forcePrompt: false +modelDisplayLabel: "Ollama" ``` - ## Start all services {#start-all-services} - From the root of the LibreChat project folder, run the following command to start the services: - ```bash docker compose up ``` - Wait until all services are fully running. 
- ## Open LibreChat in your browser {#open-librechat-in-browser} - Once all services are up and running, open your browser and go to `http://localhost:3080/` - -Create a free LibreChat account if you don't yet have one, and sign in. You should +Create a free LibreChat account if you don't yet have one, and sign in. You should now see the LibreChat interface connected to the ClickHouse MCP server, and optionally, your local LLM. - From the chat interface, select `clickhouse-playground` as your MCP server: - Select your MCP server - You can now prompt the LLM to explore the ClickHouse example datasets. Give it a go: - ```text title="Prompt" What datasets do you have access to? ``` -
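Behind a prompt like this, the MCP server ends up issuing ordinary ClickHouse queries against the playground. Conceptually (a sketch only, not the server's actual implementation), such a tool call maps to introspection statements like the following:

```sql
-- Sketch: the kind of introspection queries a "list datasets" tool
-- call corresponds to; system.tables is a standard ClickHouse system table.
SHOW DATABASES;
SELECT database, name
FROM system.tables
LIMIT 10;
```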
    diff --git a/docs/use-cases/AI_ML/MCP/ai_agent_libraries/slackbot.md b/docs/use-cases/AI_ML/MCP/ai_agent_libraries/slackbot.md index a2059362bb0..08f14922105 100644 --- a/docs/use-cases/AI_ML/MCP/ai_agent_libraries/slackbot.md +++ b/docs/use-cases/AI_ML/MCP/ai_agent_libraries/slackbot.md @@ -41,27 +41,27 @@ in the Slack documentation. ## Configure Slack app settings {#configure-slack-app-settings} - Go to `App Home` - - Under `Show Tabs` → `Messages Tab`: Enable `Allow users to send Slash commands and messages from the messages tab` - - Go to `Socket Mode` - - Enable `Socket Mode`**` - - Note down the `Socket Mode Handler`**` for the environment variable `SLACK_APP_TOKEN` - - Go to `OAuth & Permissions` - - Add the following `Bot Token Scopes`: - - `app_mentions:read` - - `assistant:write` - - `chat:write` - - `im:history` - - `im:read` - - `im:write` - - `channels:history` - - Install the app to your workspace and note down the `Bot User OAuth Token` for the environment variable `SLACK_BOT_TOKEN`. - - Go to `Event Subscriptions` - - Enable `Events` - - Under `Subscribe to bot events`, add: - - `app_mention` - - `assistant_thread_started` - - `message:im` - - Save Changes + - Under `Show Tabs` → `Messages Tab`: Enable `Allow users to send Slash commands and messages from the messages tab` + - Go to `Socket Mode` + - Enable `Socket Mode`**` + - Note down the `Socket Mode Handler`**` for the environment variable `SLACK_APP_TOKEN` + - Go to `OAuth & Permissions` + - Add the following `Bot Token Scopes`: + - `app_mentions:read` + - `assistant:write` + - `chat:write` + - `im:history` + - `im:read` + - `im:write` + - `channels:history` + - Install the app to your workspace and note down the `Bot User OAuth Token` for the environment variable `SLACK_BOT_TOKEN`. + - Go to `Event Subscriptions` + - Enable `Events` + - Under `Subscribe to bot events`, add: + - `app_mention` + - `assistant_thread_started` + - `message:im` + - Save Changes ## Add environment variables (`.env`) {#add-env-vars} diff --git a/docs/use-cases/data_lake/glue_catalog.md b/docs/use-cases/data_lake/glue_catalog.md index 472683bfeb6..026ec3ef852 100644 --- a/docs/use-cases/data_lake/glue_catalog.md +++ b/docs/use-cases/data_lake/glue_catalog.md @@ -14,37 +14,37 @@ import ExperimentalBadge from '@theme/badges/ExperimentalBadge'; -ClickHouse supports integration with multiple catalogs (Unity, Glue, Polaris, -etc.). In this guide, we will walk you through the steps to query your data in +ClickHouse supports integration with multiple catalogs (Unity, Glue, Polaris, +etc.). In this guide, we will walk you through the steps to query your data in S3 buckets using ClickHouse and the Glue Data Catalog. :::note -Glue supports many different table formats, but this integration only supports +Glue supports many different table formats, but this integration only supports Iceberg tables. ::: ## Configuring Glue in AWS {#configuring} -To connect to the glue catalog, you will need to identify the region of your -catalog and provide an access and secret key. +To connect to the glue catalog, you will need to identify the region of your +catalog and provide an access and secret key. :::note -Currently, the Glue catalog only supports access and secret keys, but we will +Currently, the Glue catalog only supports access and secret keys, but we will support additional authentication approaches in the future. 
::: ## Creating a connection between Glue data catalog and ClickHouse {#connecting} -With your Unity Catalog configured and authentication in place, establish a +With your Unity Catalog configured and authentication in place, establish a connection between ClickHouse and Unity Catalog. ```sql title="Query" CREATE DATABASE glue ENGINE = DataLakeCatalog -SETTINGS - catalog_type = 'glue', - region = 'us-west-2', - aws_access_key_id = '', +SETTINGS + catalog_type = 'glue', + region = 'us-west-2', + aws_access_key_id = '', aws_secret_access_key = '' ``` @@ -63,146 +63,146 @@ SHOW TABLES; 2. │ iceberg-benchmark.hitsparquet │ 3. │ iceberg_benchmark.hitsdailypartitioned │ 4. │ iceberg_benchmark.time_travel │ - └────────────────────────────────────────┘ -``` + └────────────────────────────────────────┘ + ``` -You can see above that some tables above are not Iceberg tables, for instance -`iceberg-benchmark.hitsparquet`. You won't be able to query these as only Iceberg -is currently supported. + You can see above that some tables above are not Iceberg tables, for instance + `iceberg-benchmark.hitsparquet`. You won't be able to query these as only Iceberg + is currently supported. -To query a table: + To query a table: -```sql title="Query" -SELECT count(*) FROM `iceberg-benchmark.hitsiceberg`; -``` + ```sql title="Query" + SELECT count(*) FROM `iceberg-benchmark.hitsiceberg`; + ``` -:::note -Backticks are required because ClickHouse doesn't support more than one namespace. -::: + :::note + Backticks are required because ClickHouse doesn't support more than one namespace. + ::: -To inspect the table DDL, run the following query: + To inspect the table DDL, run the following query: -```sql -SHOW CREATE TABLE `iceberg-benchmark.hitsiceberg`; -``` + ```sql + SHOW CREATE TABLE `iceberg-benchmark.hitsiceberg`; + ``` -```sql title="Response" - ┌─statement───────────────────────────────────────────────┐ -1.│ CREATE TABLE glue.`iceberg-benchmark.hitsiceberg` │ - │ ( │ - │ `watchid` Nullable(Int64), │ - │ `javaenable` Nullable(Int32), │ - │ `title` Nullable(String), │ - │ `goodevent` Nullable(Int32), │ - │ `eventtime` Nullable(DateTime64(6)), │ - │ `eventdate` Nullable(Date), │ - │ `counterid` Nullable(Int32), │ - │ `clientip` Nullable(Int32), │ - │ `regionid` Nullable(Int32), │ - │ `userid` Nullable(Int64), │ - │ `counterclass` Nullable(Int32), │ - │ `os` Nullable(Int32), │ - │ `useragent` Nullable(Int32), │ - │ `url` Nullable(String), │ - │ `referer` Nullable(String), │ - │ `isrefresh` Nullable(Int32), │ - │ `referercategoryid` Nullable(Int32), │ - │ `refererregionid` Nullable(Int32), │ - │ `urlcategoryid` Nullable(Int32), │ - │ `urlregionid` Nullable(Int32), │ - │ `resolutionwidth` Nullable(Int32), │ - │ `resolutionheight` Nullable(Int32), │ - │ `resolutiondepth` Nullable(Int32), │ - │ `flashmajor` Nullable(Int32), │ - │ `flashminor` Nullable(Int32), │ - │ `flashminor2` Nullable(String), │ - │ `netmajor` Nullable(Int32), │ - │ `netminor` Nullable(Int32), │ - │ `useragentmajor` Nullable(Int32), │ - │ `useragentminor` Nullable(String), │ - │ `cookieenable` Nullable(Int32), │ - │ `javascriptenable` Nullable(Int32), │ - │ `ismobile` Nullable(Int32), │ - │ `mobilephone` Nullable(Int32), │ - │ `mobilephonemodel` Nullable(String), │ - │ `params` Nullable(String), │ - │ `ipnetworkid` Nullable(Int32), │ - │ `traficsourceid` Nullable(Int32), │ - │ `searchengineid` Nullable(Int32), │ - │ `searchphrase` Nullable(String), │ - │ `advengineid` Nullable(Int32), │ - │ `isartifical` Nullable(Int32), │ - │ `windowclientwidth` 
Nullable(Int32), │ - │ `windowclientheight` Nullable(Int32), │ - │ `clienttimezone` Nullable(Int32), │ - │ `clienteventtime` Nullable(DateTime64(6)), │ - │ `silverlightversion1` Nullable(Int32), │ - │ `silverlightversion2` Nullable(Int32), │ - │ `silverlightversion3` Nullable(Int32), │ - │ `silverlightversion4` Nullable(Int32), │ - │ `pagecharset` Nullable(String), │ - │ `codeversion` Nullable(Int32), │ - │ `islink` Nullable(Int32), │ - │ `isdownload` Nullable(Int32), │ - │ `isnotbounce` Nullable(Int32), │ - │ `funiqid` Nullable(Int64), │ - │ `originalurl` Nullable(String), │ - │ `hid` Nullable(Int32), │ - │ `isoldcounter` Nullable(Int32), │ - │ `isevent` Nullable(Int32), │ - │ `isparameter` Nullable(Int32), │ - │ `dontcounthits` Nullable(Int32), │ - │ `withhash` Nullable(Int32), │ - │ `hitcolor` Nullable(String), │ - │ `localeventtime` Nullable(DateTime64(6)), │ - │ `age` Nullable(Int32), │ - │ `sex` Nullable(Int32), │ - │ `income` Nullable(Int32), │ - │ `interests` Nullable(Int32), │ - │ `robotness` Nullable(Int32), │ - │ `remoteip` Nullable(Int32), │ - │ `windowname` Nullable(Int32), │ - │ `openername` Nullable(Int32), │ - │ `historylength` Nullable(Int32), │ - │ `browserlanguage` Nullable(String), │ - │ `browsercountry` Nullable(String), │ - │ `socialnetwork` Nullable(String), │ - │ `socialaction` Nullable(String), │ - │ `httperror` Nullable(Int32), │ - │ `sendtiming` Nullable(Int32), │ - │ `dnstiming` Nullable(Int32), │ - │ `connecttiming` Nullable(Int32), │ - │ `responsestarttiming` Nullable(Int32), │ - │ `responseendtiming` Nullable(Int32), │ - │ `fetchtiming` Nullable(Int32), │ - │ `socialsourcenetworkid` Nullable(Int32), │ - │ `socialsourcepage` Nullable(String), │ - │ `paramprice` Nullable(Int32), │ - │ `paramorderid` Nullable(String), │ - │ `paramcurrency` Nullable(String), │ - │ `paramcurrencyid` Nullable(Int32), │ - │ `openstatservicename` Nullable(String), │ - │ `openstatcampaignid` Nullable(String), │ - │ `openstatadid` Nullable(String), │ - │ `openstatsourceid` Nullable(String), │ - │ `utmsource` Nullable(String), │ - │ `utmmedium` Nullable(String), │ - │ `utmcampaign` Nullable(String), │ - │ `utmcontent` Nullable(String), │ - │ `utmterm` Nullable(String), │ - │ `fromtag` Nullable(String), │ - │ `hasgclid` Nullable(Int32), │ - │ `refererhash` Nullable(Int64), │ - │ `urlhash` Nullable(Int64), │ - │ `clid` Nullable(Int32) │ - │ ) │ - │ENGINE = Iceberg('s3://') │ - └─────────────────────────────────────────────────────────┘ -``` + ```sql title="Response" + ┌─statement───────────────────────────────────────────────┐ + 1.│ CREATE TABLE glue.`iceberg-benchmark.hitsiceberg` │ + │ ( │ + │ `watchid` Nullable(Int64), │ + │ `javaenable` Nullable(Int32), │ + │ `title` Nullable(String), │ + │ `goodevent` Nullable(Int32), │ + │ `eventtime` Nullable(DateTime64(6)), │ + │ `eventdate` Nullable(Date), │ + │ `counterid` Nullable(Int32), │ + │ `clientip` Nullable(Int32), │ + │ `regionid` Nullable(Int32), │ + │ `userid` Nullable(Int64), │ + │ `counterclass` Nullable(Int32), │ + │ `os` Nullable(Int32), │ + │ `useragent` Nullable(Int32), │ + │ `url` Nullable(String), │ + │ `referer` Nullable(String), │ + │ `isrefresh` Nullable(Int32), │ + │ `referercategoryid` Nullable(Int32), │ + │ `refererregionid` Nullable(Int32), │ + │ `urlcategoryid` Nullable(Int32), │ + │ `urlregionid` Nullable(Int32), │ + │ `resolutionwidth` Nullable(Int32), │ + │ `resolutionheight` Nullable(Int32), │ + │ `resolutiondepth` Nullable(Int32), │ + │ `flashmajor` Nullable(Int32), │ + │ `flashminor` Nullable(Int32), │ + │ 
`flashminor2` Nullable(String), │ + │ `netmajor` Nullable(Int32), │ + │ `netminor` Nullable(Int32), │ + │ `useragentmajor` Nullable(Int32), │ + │ `useragentminor` Nullable(String), │ + │ `cookieenable` Nullable(Int32), │ + │ `javascriptenable` Nullable(Int32), │ + │ `ismobile` Nullable(Int32), │ + │ `mobilephone` Nullable(Int32), │ + │ `mobilephonemodel` Nullable(String), │ + │ `params` Nullable(String), │ + │ `ipnetworkid` Nullable(Int32), │ + │ `traficsourceid` Nullable(Int32), │ + │ `searchengineid` Nullable(Int32), │ + │ `searchphrase` Nullable(String), │ + │ `advengineid` Nullable(Int32), │ + │ `isartifical` Nullable(Int32), │ + │ `windowclientwidth` Nullable(Int32), │ + │ `windowclientheight` Nullable(Int32), │ + │ `clienttimezone` Nullable(Int32), │ + │ `clienteventtime` Nullable(DateTime64(6)), │ + │ `silverlightversion1` Nullable(Int32), │ + │ `silverlightversion2` Nullable(Int32), │ + │ `silverlightversion3` Nullable(Int32), │ + │ `silverlightversion4` Nullable(Int32), │ + │ `pagecharset` Nullable(String), │ + │ `codeversion` Nullable(Int32), │ + │ `islink` Nullable(Int32), │ + │ `isdownload` Nullable(Int32), │ + │ `isnotbounce` Nullable(Int32), │ + │ `funiqid` Nullable(Int64), │ + │ `originalurl` Nullable(String), │ + │ `hid` Nullable(Int32), │ + │ `isoldcounter` Nullable(Int32), │ + │ `isevent` Nullable(Int32), │ + │ `isparameter` Nullable(Int32), │ + │ `dontcounthits` Nullable(Int32), │ + │ `withhash` Nullable(Int32), │ + │ `hitcolor` Nullable(String), │ + │ `localeventtime` Nullable(DateTime64(6)), │ + │ `age` Nullable(Int32), │ + │ `sex` Nullable(Int32), │ + │ `income` Nullable(Int32), │ + │ `interests` Nullable(Int32), │ + │ `robotness` Nullable(Int32), │ + │ `remoteip` Nullable(Int32), │ + │ `windowname` Nullable(Int32), │ + │ `openername` Nullable(Int32), │ + │ `historylength` Nullable(Int32), │ + │ `browserlanguage` Nullable(String), │ + │ `browsercountry` Nullable(String), │ + │ `socialnetwork` Nullable(String), │ + │ `socialaction` Nullable(String), │ + │ `httperror` Nullable(Int32), │ + │ `sendtiming` Nullable(Int32), │ + │ `dnstiming` Nullable(Int32), │ + │ `connecttiming` Nullable(Int32), │ + │ `responsestarttiming` Nullable(Int32), │ + │ `responseendtiming` Nullable(Int32), │ + │ `fetchtiming` Nullable(Int32), │ + │ `socialsourcenetworkid` Nullable(Int32), │ + │ `socialsourcepage` Nullable(String), │ + │ `paramprice` Nullable(Int32), │ + │ `paramorderid` Nullable(String), │ + │ `paramcurrency` Nullable(String), │ + │ `paramcurrencyid` Nullable(Int32), │ + │ `openstatservicename` Nullable(String), │ + │ `openstatcampaignid` Nullable(String), │ + │ `openstatadid` Nullable(String), │ + │ `openstatsourceid` Nullable(String), │ + │ `utmsource` Nullable(String), │ + │ `utmmedium` Nullable(String), │ + │ `utmcampaign` Nullable(String), │ + │ `utmcontent` Nullable(String), │ + │ `utmterm` Nullable(String), │ + │ `fromtag` Nullable(String), │ + │ `hasgclid` Nullable(Int32), │ + │ `refererhash` Nullable(Int64), │ + │ `urlhash` Nullable(Int64), │ + │ `clid` Nullable(Int32) │ + │ ) │ + │ENGINE = Iceberg('s3://') │ + └─────────────────────────────────────────────────────────┘ + ``` ## Loading data from your Data Lake into ClickHouse {#loading-data-into-clickhouse} -If you need to load data from Databricks into ClickHouse, start by creating a +If you need to load data from Databricks into ClickHouse, start by creating a local ClickHouse table: ```sql title="Query" @@ -320,6 +320,6 @@ PRIMARY KEY (CounterID, EventDate, UserID, EventTime, WatchID); Then load the data from your 
Iceberg table: ```sql title="Query" -INSERT INTO default.hits +INSERT INTO default.hits SELECT * FROM glue.`iceberg-benchmark.hitsiceberg`; ``` diff --git a/docs/use-cases/data_lake/unity_catalog.md b/docs/use-cases/data_lake/unity_catalog.md index 5dcbaa6a46b..900c7fa9c0c 100644 --- a/docs/use-cases/data_lake/unity_catalog.md +++ b/docs/use-cases/data_lake/unity_catalog.md @@ -19,7 +19,7 @@ Integration with the Unity Catalog works for managed and external tables. This integration is currently only supported on AWS. ::: -ClickHouse supports integration with multiple catalogs (Unity, Glue, Polaris, etc.). This guide will walk you through the steps to query your data managed by Databricks using ClickHouse and the [Unity Catalog](https://www.databricks.com/product/unity-catalog). +ClickHouse supports integration with multiple catalogs (Unity, Glue, Polaris, etc.). This guide will walk you through the steps to query your data managed by Databricks using ClickHouse and the [Unity Catalog](https://www.databricks.com/product/unity-catalog). Databricks supports multiple data formats for their lakehouse. With ClickHouse, you can query Unity Catalog tables as both Delta and Iceberg. @@ -40,7 +40,6 @@ Once your catalog is configured, you must generate credentials for ClickHouse. T * For Delta clients, use a Personal Access Token ([PAT](https://docs.databricks.com/aws/en/dev-tools/auth/pat)). - ## Creating a connection between Unity Catalog and ClickHouse {#creating-a-connection-between-unity-catalog-and-clickhouse} With your Unity Catalog configured and authentication in place, establish a connection between ClickHouse and Unity Catalog. @@ -58,7 +57,7 @@ SETTINGS warehouse = 'CATALOG_NAME', catalog_credential = '', catalog_type ```sql CREATE DATABASE unity ENGINE = DataLakeCatalog('https://.cloud.databricks.com/api/2.1/unity-catalog/iceberg') -SETTINGS catalog_type = 'rest', catalog_credential = ':', warehouse = 'workspace', +SETTINGS catalog_type = 'rest', catalog_credential = ':', warehouse = 'workspace', oauth_server_uri = 'https://.cloud.databricks.com/oidc/v1/token', auth_scope = 'all-apis,sql' ``` diff --git a/docs/use-cases/observability/build-your-own/grafana.md b/docs/use-cases/observability/build-your-own/grafana.md index 0cb500048b4..94ca4442368 100644 --- a/docs/use-cases/observability/build-your-own/grafana.md +++ b/docs/use-cases/observability/build-your-own/grafana.md @@ -39,7 +39,6 @@ The Traces configuration is slightly more complex (full list [here](/engines/tab Connector config - Once configured users can navigate to [Grafana Explore](https://grafana.com/docs/grafana/latest/explore/) and begin searching logs and traces. ## Logs {#logs} @@ -168,22 +167,22 @@ Multi-line charts will be automatically rendered for a query provided the follow - field 2: value to group by. This should be a String. 
- field 3+: the metric values -For example: - -```sql -SELECT - $__timeInterval(Timestamp) as time, - ServiceName, - quantile(0.99)(Duration)/1000000 AS p99 -FROM otel_traces -WHERE $__timeFilter(Timestamp) -AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime ) -GROUP BY ServiceName, time -ORDER BY time ASC -LIMIT 100000 -``` - -Multi-line charts + For example: + + ```sql + SELECT + $__timeInterval(Timestamp) as time, + ServiceName, + quantile(0.99)(Duration)/1000000 AS p99 + FROM otel_traces + WHERE $__timeFilter(Timestamp) + AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime ) + GROUP BY ServiceName, time + ORDER BY time ASC + LIMIT 100000 + ``` + + Multi-line charts ### Visualizing geo data {#visualizing-geo-data} diff --git a/docs/use-cases/observability/build-your-own/integrating-opentelemetry.md b/docs/use-cases/observability/build-your-own/integrating-opentelemetry.md index 00517e5c3ae..34c7756084d 100644 --- a/docs/use-cases/observability/build-your-own/integrating-opentelemetry.md +++ b/docs/use-cases/observability/build-your-own/integrating-opentelemetry.md @@ -30,7 +30,7 @@ OpenTelemetry consists of a number of components. As well as providing a data an - The [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) is a proxy that receives, processes, and exports telemetry data. A ClickHouse-powered solution uses this component for both log collection and event processing prior to batching and inserting. - [Language SDKs](https://opentelemetry.io/docs/languages/) that implement the specification, APIs, and export of telemetry data. These SDKs effectively ensure traces are correctly recorded within an application's code, generating constituent spans and ensuring context is propagated across services through metadata - thus formulating distributed traces and ensuring spans can be correlated. These SDKs are complemented by an ecosystem that automatically implements common libraries and frameworks, thus meaning the user is not required to change their code and obtains out-of-the-box instrumentation. -A ClickHouse-powered Observability solution exploits both of these tools. + A ClickHouse-powered Observability solution exploits both of these tools. ## Distributions {#distributions} @@ -41,7 +41,7 @@ This distribution contains many components and allows users to experiment with v - Reduce the size of the collector, reducing deployment times for the collector - Improve the security of the collector by reducing the available attack surface area -Building a [custom collector](https://opentelemetry.io/docs/collector/custom-collector/) can be achieved using the [OpenTelemetry Collector Builder](https://github.com/open-telemetry/opentelemetry-collector/tree/main/cmd/builder). + Building a [custom collector](https://opentelemetry.io/docs/collector/custom-collector/) can be achieved using the [OpenTelemetry Collector Builder](https://github.com/open-telemetry/opentelemetry-collector/tree/main/cmd/builder). ## Ingesting data with OTel {#ingesting-data-with-otel} @@ -49,11 +49,10 @@ Building a [custom collector](https://opentelemetry.io/docs/collector/custom-col In order to collect logs and insert them into ClickHouse, we recommend using the OpenTelemetry Collector. The OpenTelemetry Collector can be deployed in two principal roles: - - **Agent** - Agent instances collect data at the edge e.g. on servers or on Kubernetes nodes, or receive events directly from applications - instrumented with an OpenTelemetry SDK. 
In the latter case, the agent instance runs with the application or on the same host as the application (such as a sidecar or a DaemonSet). Agents can either send their data directly to ClickHouse or to a gateway instance. In the former case, this is referred to as [Agent deployment pattern](https://opentelemetry.io/docs/collector/deployment/agent/). - **Gateway** - Gateway instances provide a standalone service (for example, a deployment in Kubernetes), typically per cluster, per data center, or per region. These receive events from applications (or other collectors as agents) via a single OTLP endpoint. Typically, a set of gateway instances are deployed, with an out-of-the-box load balancer used to distribute the load amongst them. If all agents and applications send their signals to this single endpoint, it is often referred to as a [Gateway deployment pattern](https://opentelemetry.io/docs/collector/deployment/gateway/). -Below we assume a simple agent collector, sending its events directly to ClickHouse. See [Scaling with Gateways](#scaling-with-gateways) for further details on using gateways and when they are applicable. + Below we assume a simple agent collector, sending its events directly to ClickHouse. See [Scaling with Gateways](#scaling-with-gateways) for further details on using gateways and when they are applicable. ### Collecting logs {#collecting-logs} @@ -75,13 +74,13 @@ This approach requires users to instrument their code with their [appropriate la - **Scraping via Filelog receiver** - This receiver tails files on disk and formulates log messages, sending these to ClickHouse. This receiver handles complex tasks such as detecting multi-line messages, handling log rollovers, checkpointing for robustness to restart, and extracting structure. This receiver is additionally able to tail Docker and Kubernetes container logs, deployable as a helm chart, [extracting the structure from these](https://opentelemetry.io/blog/2024/otel-collector-container-log-parser/) and enriching them with the pod details. -File log receiver + File log receiver -**Most deployments will use a combination of the above receivers. We recommend users read the [collector documentation](https://opentelemetry.io/docs/collector/) and familiarize themselves with the basic concepts, along with [the configuration structure](https://opentelemetry.io/docs/collector/configuration/) and [installation methods](https://opentelemetry.io/docs/collector/installation/).** + **Most deployments will use a combination of the above receivers. We recommend users read the [collector documentation](https://opentelemetry.io/docs/collector/) and familiarize themselves with the basic concepts, along with [the configuration structure](https://opentelemetry.io/docs/collector/configuration/) and [installation methods](https://opentelemetry.io/docs/collector/installation/).** -:::note Tip: `otelbin.io` -[`otelbin.io`](https://www.otelbin.io/) is useful to validate and visualize configurations. -::: + :::note Tip: `otelbin.io` + [`otelbin.io`](https://www.otelbin.io/) is useful to validate and visualize configurations. 
+ ::: ## Structured vs unstructured {#structured-vs-unstructured} @@ -117,19 +116,19 @@ For example purposes, we provide a structured (JSON) and unstructured logging da - [Unstructured](https://datasets-documentation.s3.eu-west-3.amazonaws.com/http_logs/access-unstructured.log.gz) - [Structured](https://datasets-documentation.s3.eu-west-3.amazonaws.com/http_logs/access-structured.log.gz) -We use the structured dataset for the example below. Ensure this file is downloaded and extracted to reproduce the following examples. + We use the structured dataset for the example below. Ensure this file is downloaded and extracted to reproduce the following examples. -The following represents a simple configuration for the OTel Collector which reads these files on disk, using the filelog receiver, and outputs the resulting messages to stdout. We use the [`json_parser`](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/stanza/docs/operators/json_parser.md) operator since our logs are structured. Modify the path to the access-structured.log file. + The following represents a simple configuration for the OTel Collector which reads these files on disk, using the filelog receiver, and outputs the resulting messages to stdout. We use the [`json_parser`](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/stanza/docs/operators/json_parser.md) operator since our logs are structured. Modify the path to the access-structured.log file. -:::note Consider ClickHouse for parsing -The below example extracts the timestamp from the log. This requires the use of the `json_parser` operator, which converts the entire log line to a JSON string, placing the result in `LogAttributes`. This can be computationally expensive and [can be done more efficiently in ClickHouse](https://clickhouse.com/blog/worlds-fastest-json-querying-tool-clickhouse-local) - [Extracting structure with SQL](/use-cases/observability/schema-design#extracting-structure-with-sql). An equivalent unstructured example, which uses the [`regex_parser`](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/stanza/docs/operators/regex_parser.md) to achieve this, can be found [here](https://pastila.nl/?01da7ee2/2ffd3ba8124a7d6e4ddf39422ad5b863#swBkiAXvGP7mRPgbuzzHFA==). -::: + :::note Consider ClickHouse for parsing + The below example extracts the timestamp from the log. This requires the use of the `json_parser` operator, which converts the entire log line to a JSON string, placing the result in `LogAttributes`. This can be computationally expensive and [can be done more efficiently in ClickHouse](https://clickhouse.com/blog/worlds-fastest-json-querying-tool-clickhouse-local) - [Extracting structure with SQL](/use-cases/observability/schema-design#extracting-structure-with-sql). An equivalent unstructured example, which uses the [`regex_parser`](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/stanza/docs/operators/regex_parser.md) to achieve this, can be found [here](https://pastila.nl/?01da7ee2/2ffd3ba8124a7d6e4ddf39422ad5b863#swBkiAXvGP7mRPgbuzzHFA==). 
+ ::: -**[config-structured-logs.yaml](https://www.otelbin.io/#config=receivers%3A*N_filelog%3A*N___include%3A*N_____-_%2Fopt%2Fdata%2Flogs%2Faccess-structured.log*N___start*_at%3A_beginning*N___operators%3A*N_____-_type%3A_json*_parser*N_______timestamp%3A*N_________parse*_from%3A_attributes.time*_local*N_________layout%3A_*%22*.Y-*.m-*.d_*.H%3A*.M%3A*.S*%22*N*N*Nprocessors%3A*N__batch%3A*N____timeout%3A_5s*N____send*_batch*_size%3A_1*N*N*Nexporters%3A*N_logging%3A*N___loglevel%3A_debug*N*N*Nservice%3A*N_pipelines%3A*N___logs%3A*N_____receivers%3A_%5Bfilelog%5D*N_____processors%3A_%5Bbatch%5D*N_____exporters%3A_%5Blogging%5D%7E)** + **[config-structured-logs.yaml](https://www.otelbin.io/#config=receivers%3A*N_filelog%3A*N___include%3A*N_____-_%2Fopt%2Fdata%2Flogs%2Faccess-structured.log*N___start*_at%3A_beginning*N___operators%3A*N_____-_type%3A_json*_parser*N_______timestamp%3A*N_________parse*_from%3A_attributes.time*_local*N_________layout%3A_*%22*.Y-*.m-*.d_*.H%3A*.M%3A*.S*%22*N*N*Nprocessors%3A*N__batch%3A*N____timeout%3A_5s*N____send*_batch*_size%3A_1*N*N*Nexporters%3A*N_logging%3A*N___loglevel%3A_debug*N*N*Nservice%3A*N_pipelines%3A*N___logs%3A*N_____receivers%3A_%5Bfilelog%5D*N_____processors%3A_%5Bbatch%5D*N_____exporters%3A_%5Blogging%5D%7E)** -```yaml -receivers: - filelog: + ```yaml + receivers: + filelog: include: - /opt/data/logs/access-structured.log start_at: beginning @@ -138,39 +137,39 @@ receivers: timestamp: parse_from: attributes.time_local layout: '%Y-%m-%d %H:%M:%S' -processors: - batch: + processors: + batch: timeout: 5s send_batch_size: 1 -exporters: - logging: + exporters: + logging: loglevel: debug -service: - pipelines: + service: + pipelines: logs: receivers: [filelog] processors: [batch] exporters: [logging] -``` + ``` -Users can follow the [official instructions](https://opentelemetry.io/docs/collector/installation/) to install the collector locally. Importantly, ensure the instructions are modified to use the [contrib distribution](https://github.com/open-telemetry/opentelemetry-collector-releases/tree/main/distributions/otelcol-contrib) (which contains the `filelog` receiver) e.g. instead of `otelcol_0.102.1_darwin_arm64.tar.gz` users would download `otelcol-contrib_0.102.1_darwin_arm64.tar.gz`. Releases can be found [here](https://github.com/open-telemetry/opentelemetry-collector-releases/releases). + Users can follow the [official instructions](https://opentelemetry.io/docs/collector/installation/) to install the collector locally. Importantly, ensure the instructions are modified to use the [contrib distribution](https://github.com/open-telemetry/opentelemetry-collector-releases/tree/main/distributions/otelcol-contrib) (which contains the `filelog` receiver) e.g. instead of `otelcol_0.102.1_darwin_arm64.tar.gz` users would download `otelcol-contrib_0.102.1_darwin_arm64.tar.gz`. Releases can be found [here](https://github.com/open-telemetry/opentelemetry-collector-releases/releases). 
-Once installed, the OTel Collector can be run with the following commands: + Once installed, the OTel Collector can be run with the following commands: -```bash -./otelcol-contrib --config config-logs.yaml -``` + ```bash + ./otelcol-contrib --config config-logs.yaml + ``` -Assuming the use of the structured logs, messages will take the following form on the output: + Assuming the use of the structured logs, messages will take the following form on the output: -```response -LogRecord #98 -ObservedTimestamp: 2024-06-19 13:21:16.414259 +0000 UTC -Timestamp: 2019-01-22 01:12:53 +0000 UTC -SeverityText: -SeverityNumber: Unspecified(0) -Body: Str({"remote_addr":"66.249.66.195","remote_user":"-","run_time":"0","time_local":"2019-01-22 01:12:53.000","request_type":"GET","request_path":"\/product\/7564","request_protocol":"HTTP\/1.1","status":"301","size":"178","referer":"-","user_agent":"Mozilla\/5.0 (Linux; Android 6.0.1; Nexus 5X Build\/MMB29P) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/41.0.2272.96 Mobile Safari\/537.36 (compatible; Googlebot\/2.1; +http:\/\/www.google.com\/bot.html)"}) -Attributes: + ```response + LogRecord #98 + ObservedTimestamp: 2024-06-19 13:21:16.414259 +0000 UTC + Timestamp: 2019-01-22 01:12:53 +0000 UTC + SeverityText: + SeverityNumber: Unspecified(0) + Body: Str({"remote_addr":"66.249.66.195","remote_user":"-","run_time":"0","time_local":"2019-01-22 01:12:53.000","request_type":"GET","request_path":"\/product\/7564","request_protocol":"HTTP\/1.1","status":"301","size":"178","referer":"-","user_agent":"Mozilla\/5.0 (Linux; Android 6.0.1; Nexus 5X Build\/MMB29P) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/41.0.2272.96 Mobile Safari\/537.36 (compatible; Googlebot\/2.1; +http:\/\/www.google.com\/bot.html)"}) + Attributes: -> remote_user: Str(-) -> request_protocol: Str(HTTP/1.1) -> time_local: Str(2019-01-22 01:12:53.000) @@ -183,24 +182,24 @@ Attributes: -> request_type: Str(GET) -> request_path: Str(/product/7564) -> run_time: Str(0) -Trace ID: -Span ID: -Flags: 0 -``` + Trace ID: + Span ID: + Flags: 0 + ``` -The above represents a single log message as produced by the OTel collector. We ingest these same messages into ClickHouse in later sections. + The above represents a single log message as produced by the OTel collector. We ingest these same messages into ClickHouse in later sections. -The full schema of log messages, along with additional columns which may be present if using other receivers, is maintained [here](https://opentelemetry.io/docs/specs/otel/logs/data-model/). **We strongly recommend users familiarize themselves with this schema.** + The full schema of log messages, along with additional columns which may be present if using other receivers, is maintained [here](https://opentelemetry.io/docs/specs/otel/logs/data-model/). **We strongly recommend users familiarize themselves with this schema.** -The key here is that the log line itself is held as a string within the `Body` field but the JSON has been auto-extracted to the Attributes field thanks to the `json_parser`. This same [operator](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/stanza/docs/operators/README.md#what-operators-are-available) has been used to extract the timestamp to the appropriate `Timestamp` column. For recommendations on processing logs with OTel see [Processing](#processing---filtering-transforming-and-enriching). 
+ The key here is that the log line itself is held as a string within the `Body` field but the JSON has been auto-extracted to the Attributes field thanks to the `json_parser`. This same [operator](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/stanza/docs/operators/README.md#what-operators-are-available) has been used to extract the timestamp to the appropriate `Timestamp` column. For recommendations on processing logs with OTel see [Processing](#processing---filtering-transforming-and-enriching). -:::note Operators -Operators are the most basic unit of log processing. Each operator fulfills a single responsibility, such as reading lines from a file or parsing JSON from a field. Operators are then chained together in a pipeline to achieve the desired result. -::: + :::note Operators + Operators are the most basic unit of log processing. Each operator fulfills a single responsibility, such as reading lines from a file or parsing JSON from a field. Operators are then chained together in a pipeline to achieve the desired result. + ::: -The above messages don't have a `TraceID` or `SpanID` field. If present, e.g. in cases where users are implementing [distributed tracing](https://opentelemetry.io/docs/concepts/observability-primer/#distributed-traces), these could be extracted from the JSON using the same techniques shown above. + The above messages don't have a `TraceID` or `SpanID` field. If present, e.g. in cases where users are implementing [distributed tracing](https://opentelemetry.io/docs/concepts/observability-primer/#distributed-traces), these could be extracted from the JSON using the same techniques shown above. -For users needing to collect local or Kubernetes log files, we recommend users become familiar with the configuration options available for the [filelog receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/filelogreceiver/README.md#configuration) and how [offsets](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver#offset-tracking) and [multiline log parsing is handled](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver#example---multiline-logs-parsing). + For users needing to collect local or Kubernetes log files, we recommend users become familiar with the configuration options available for the [filelog receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/filelogreceiver/README.md#configuration) and how [offsets](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver#offset-tracking) and [multiline log parsing is handled](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver#example---multiline-logs-parsing). ## Collecting Kubernetes logs {#collecting-kubernetes-logs} @@ -288,9 +287,9 @@ As demonstrated in the earlier example of setting the timestamp for a log event, - **Operators** - [Operators](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/stanza/docs/operators/README.md) provide the most basic unit of processing available at the receiver. Basic parsing is supported, allowing fields such as the Severity and Timestamp to be set. JSON and regex parsing are supported here along with event filtering and basic transformations. We recommend performing event filtering here. 
-We recommend users avoid doing excessive event processing using operators or [transform processors](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/transformprocessor/README.md). These can incur considerable memory and CPU overhead, especially JSON parsing. It is possible to do all processing in ClickHouse at insert time with materialized views and columns with some exceptions - specifically, context-aware enrichment e.g. adding of k8s metadata. For more details see [Extracting structure with SQL](/use-cases/observability/schema-design#extracting-structure-with-sql). + We recommend users avoid doing excessive event processing using operators or [transform processors](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/transformprocessor/README.md). These can incur considerable memory and CPU overhead, especially JSON parsing. It is possible to do all processing in ClickHouse at insert time with materialized views and columns with some exceptions - specifically, context-aware enrichment e.g. adding of k8s metadata. For more details see [Extracting structure with SQL](/use-cases/observability/schema-design#extracting-structure-with-sql). -If processing is done using the OTel collector, we recommend doing transformations at gateway instances and minimizing any work done at agent instances. This will ensure the resources required by agents at the edge, running on servers, are as minimal as possible. Typically, we see users only performing filtering (to minimize unnecessary network usage), timestamp setting (via operators), and enrichment, which requires context in agents. For example, if gateway instances reside in a different Kubernetes cluster, k8s enrichment will need to occur in the agent. + If processing is done using the OTel collector, we recommend doing transformations at gateway instances and minimizing any work done at agent instances. This will ensure the resources required by agents at the edge, running on servers, are as minimal as possible. Typically, we see users only performing filtering (to minimize unnecessary network usage), timestamp setting (via operators), and enrichment, which requires context in agents. For example, if gateway instances reside in a different Kubernetes cluster, k8s enrichment will need to occur in the agent. 
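As a rough sketch of what shifting this work to ClickHouse can look like, the statement below promotes a frequently queried map key to a dedicated column at insert time. It assumes the default exporter schema, where attributes land in a `LogAttributes` `Map(String, String)` column; the `Status` column name and the `status` key are illustrative only.

```sql
-- Sketch only: extract a map key to a root column at insert time via a materialized column.
-- Assumes the default `otel_logs` schema with a LogAttributes Map(String, String) column.
ALTER TABLE otel_logs
    ADD COLUMN IF NOT EXISTS Status UInt16
    MATERIALIZED toUInt16OrZero(LogAttributes['status']);
```

The same extraction can also be expressed as a materialized view writing to a separate target table - see [Extracting structure with SQL](/use-cases/observability/schema-design#extracting-structure-with-sql).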
### Example {#example-2} @@ -303,32 +302,32 @@ receivers: filelog: include: - /opt/data/logs/access-unstructured.log - start_at: beginning - operators: + start_at: beginning + operators: - type: regex_parser - regex: '^(?P[\d.]+)\s+-\s+-\s+\[(?P[^\]]+)\]\s+"(?P[A-Z]+)\s+(?P[^\s]+)\s+HTTP/[^\s]+"\s+(?P\d+)\s+(?P\d+)\s+"(?P[^"]*)"\s+"(?P[^"]*)"' - timestamp: + regex: '^(?P[\d.]+)\s+-\s+-\s+\[(?P[^\]]+)\]\s+"(?P[A-Z]+)\s+(?P[^\s]+)\s+HTTP/[^\s]+"\s+(?P\d+)\s+(?P\d+)\s+"(?P[^"]*)"\s+"(?P[^"]*)"' + timestamp: parse_from: attributes.timestamp layout: '%d/%b/%Y:%H:%M:%S %z' #22/Jan/2019:03:56:14 +0330 -processors: - batch: - timeout: 1s - send_batch_size: 100 - memory_limiter: - check_interval: 1s - limit_mib: 2048 - spike_limit_mib: 256 -exporters: - logging: - loglevel: debug -service: - pipelines: - logs: - receivers: [filelog] - processors: [batch, memory_limiter] - exporters: [logging] -``` + processors: + batch: + timeout: 1s + send_batch_size: 100 + memory_limiter: + check_interval: 1s + limit_mib: 2048 + spike_limit_mib: 256 + exporters: + logging: + loglevel: debug + service: + pipelines: + logs: + receivers: [filelog] + processors: [batch, memory_limiter] + exporters: [logging] + ``` ```bash ./otelcol-contrib --config config-unstructured-logs-with-processor.yaml @@ -351,23 +350,23 @@ receivers: filelog: include: - /opt/data/logs/access-structured.log - start_at: beginning - operators: + start_at: beginning + operators: - type: json_parser - timestamp: + timestamp: parse_from: attributes.time_local layout: '%Y-%m-%d %H:%M:%S' - otlp: - protocols: - grpc: - endpoint: 0.0.0.0:4317 -processors: - batch: - timeout: 5s - send_batch_size: 5000 -exporters: - clickhouse: - endpoint: tcp://localhost:9000?dial_timeout=10s&compress=lz4&async_insert=1 + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + processors: + batch: + timeout: 5s + send_batch_size: 5000 + exporters: + clickhouse: + endpoint: tcp://localhost:9000?dial_timeout=10s&compress=lz4&async_insert=1 # ttl: 72h traces_table_name: otel_traces logs_table_name: otel_logs @@ -382,7 +381,6 @@ exporters: max_interval: 30s max_elapsed_time: 300s - service: pipelines: logs: @@ -400,7 +398,7 @@ Note the following key settings: - **pipelines** - The above configuration highlights the use of [pipelines](https://opentelemetry.io/docs/collector/configuration/#pipelines), consisting of a set of receivers, processors and exporters with one for logs and traces. - **endpoint** - Communication with ClickHouse is configured via the `endpoint` parameter. The connection string `tcp://localhost:9000?dial_timeout=10s&compress=lz4&async_insert=1` causes communication to occur over TCP. If users prefer HTTP for traffic-switching reasons, modify this connection string as described [here](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/exporter/clickhouseexporter/README.md#configuration-options). Full connection details, with the ability to specify a username and password within this connection string, are described [here](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/exporter/clickhouseexporter/README.md#configuration-options). -**Important:** Note the above connection string enables both compression (lz4) as well as asynchronous inserts. We recommend both are always enabled. See [Batching](#batching) for further details on asynchronous inserts. Compression should always be specified and will not by default be enabled by default on older versions of the exporter. 
+ **Important:** Note the above connection string enables both compression (lz4) as well as asynchronous inserts. We recommend both are always enabled. See [Batching](#batching) for further details on asynchronous inserts. Compression should always be specified and will not be enabled by default on older versions of the exporter.
- **ttl** - the value here determines how long data is retained. Further details in "Managing data". This should be specified as a time unit in hours e.g. 72h. We disable TTL in the example below since our data is from 2019 and will be removed by ClickHouse immediately if inserted.
- **traces_table_name** and **logs_table_name** - determine the names of the logs and traces tables.
@@ -410,80 +408,79 @@ Note the following key settings:
- **batch** - a batch processor ensures events are sent as batches. We recommend a value of around 5000 with a timeout of 5s. Whichever of these is reached first will initiate a batch to be flushed to the exporter. Lowering these values will mean a lower latency pipeline with data available for querying sooner, at the expense of more connections and batches sent to ClickHouse. This is not recommended if users are not using [asynchronous inserts](https://clickhouse.com/blog/asynchronous-data-inserts-in-clickhouse) as it may cause issues with [too many parts](https://clickhouse.com/blog/common-getting-started-issues-with-clickhouse#1-too-many-parts) in ClickHouse. Conversely, if users are using asynchronous inserts, the availability of data for querying will also depend on the asynchronous insert settings - although data will still be flushed from the connector sooner. See [Batching](#batching) for more details.
- **sending_queue** - controls the size of the sending queue. Each item in the queue contains a batch. If this queue is exceeded e.g. due to ClickHouse being unreachable but events continue to arrive, batches will be dropped.
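Whichever combination of batching and asynchronous inserts is chosen, it can be useful to confirm that parts are not accumulating faster than merges can keep up. A quick, illustrative check (nothing here is specific to the exporter):

```sql
-- Sketch only: count active parts per table to spot a potential "too many parts" situation.
SELECT
    database,
    table,
    count() AS active_parts
FROM system.parts
WHERE active
GROUP BY database, table
ORDER BY active_parts DESC
LIMIT 10
```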
-Assuming users have extracted the structured log file and have a [local instance of ClickHouse](/install) running (with default authentication), users can run this configuration via the command: - -```bash -./otelcol-contrib --config clickhouse-config.yaml -``` - -To send trace data to this collector, run the following command using the `telemetrygen` tool: - -```bash -$GOBIN/telemetrygen traces --otlp-insecure --traces 300 -``` - -Once running, confirm log events are present with a simple query: - -```sql -SELECT * -FROM otel_logs -LIMIT 1 -FORMAT Vertical - -Row 1: -────── -Timestamp: 2019-01-22 06:46:14.000000000 -TraceId: -SpanId: -TraceFlags: 0 -SeverityText: -SeverityNumber: 0 -ServiceName: -Body: {"remote_addr":"109.230.70.66","remote_user":"-","run_time":"0","time_local":"2019-01-22 06:46:14.000","request_type":"GET","request_path":"\/image\/61884\/productModel\/150x150","request_protocol":"HTTP\/1.1","status":"200","size":"1684","referer":"https:\/\/www.zanbil.ir\/filter\/p3%2Cb2","user_agent":"Mozilla\/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko\/20100101 Firefox\/64.0"} -ResourceSchemaUrl: -ResourceAttributes: {} -ScopeSchemaUrl: -ScopeName: -ScopeVersion: -ScopeAttributes: {} -LogAttributes: {'referer':'https://www.zanbil.ir/filter/p3%2Cb2','log.file.name':'access-structured.log','run_time':'0','remote_user':'-','request_protocol':'HTTP/1.1','size':'1684','user_agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0','remote_addr':'109.230.70.66','request_path':'/image/61884/productModel/150x150','status':'200','time_local':'2019-01-22 06:46:14.000','request_type':'GET'} - -1 row in set. Elapsed: 0.012 sec. Processed 5.04 thousand rows, 4.62 MB (414.14 thousand rows/s., 379.48 MB/s.) -Peak memory usage: 5.41 MiB. 
- - -Likewise, for trace events, users can check the `otel_traces` table: - -SELECT * -FROM otel_traces -LIMIT 1 -FORMAT Vertical - -Row 1: -────── -Timestamp: 2024-06-20 11:36:41.181398000 -TraceId: 00bba81fbd38a242ebb0c81a8ab85d8f -SpanId: beef91a2c8685ace -ParentSpanId: -TraceState: -SpanName: lets-go -SpanKind: SPAN_KIND_CLIENT -ServiceName: telemetrygen -ResourceAttributes: {'service.name':'telemetrygen'} -ScopeName: telemetrygen -ScopeVersion: -SpanAttributes: {'peer.service':'telemetrygen-server','net.peer.ip':'1.2.3.4'} -Duration: 123000 -StatusCode: STATUS_CODE_UNSET -StatusMessage: -Events.Timestamp: [] -Events.Name: [] -Events.Attributes: [] -Links.TraceId: [] -Links.SpanId: [] -Links.TraceState: [] -Links.Attributes: [] -``` + Assuming users have extracted the structured log file and have a [local instance of ClickHouse](/install) running (with default authentication), users can run this configuration via the command: + + ```bash + ./otelcol-contrib --config clickhouse-config.yaml + ``` + + To send trace data to this collector, run the following command using the `telemetrygen` tool: + + ```bash + $GOBIN/telemetrygen traces --otlp-insecure --traces 300 + ``` + + Once running, confirm log events are present with a simple query: + + ```sql + SELECT * + FROM otel_logs + LIMIT 1 + FORMAT Vertical + + Row 1: + ────── + Timestamp: 2019-01-22 06:46:14.000000000 + TraceId: + SpanId: + TraceFlags: 0 + SeverityText: + SeverityNumber: 0 + ServiceName: + Body: {"remote_addr":"109.230.70.66","remote_user":"-","run_time":"0","time_local":"2019-01-22 06:46:14.000","request_type":"GET","request_path":"\/image\/61884\/productModel\/150x150","request_protocol":"HTTP\/1.1","status":"200","size":"1684","referer":"https:\/\/www.zanbil.ir\/filter\/p3%2Cb2","user_agent":"Mozilla\/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko\/20100101 Firefox\/64.0"} + ResourceSchemaUrl: + ResourceAttributes: {} + ScopeSchemaUrl: + ScopeName: + ScopeVersion: + ScopeAttributes: {} + LogAttributes: {'referer':'https://www.zanbil.ir/filter/p3%2Cb2','log.file.name':'access-structured.log','run_time':'0','remote_user':'-','request_protocol':'HTTP/1.1','size':'1684','user_agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0','remote_addr':'109.230.70.66','request_path':'/image/61884/productModel/150x150','status':'200','time_local':'2019-01-22 06:46:14.000','request_type':'GET'} + + 1 row in set. Elapsed: 0.012 sec. Processed 5.04 thousand rows, 4.62 MB (414.14 thousand rows/s., 379.48 MB/s.) + Peak memory usage: 5.41 MiB. + + Likewise, for trace events, users can check the `otel_traces` table: + + SELECT * + FROM otel_traces + LIMIT 1 + FORMAT Vertical + + Row 1: + ────── + Timestamp: 2024-06-20 11:36:41.181398000 + TraceId: 00bba81fbd38a242ebb0c81a8ab85d8f + SpanId: beef91a2c8685ace + ParentSpanId: + TraceState: + SpanName: lets-go + SpanKind: SPAN_KIND_CLIENT + ServiceName: telemetrygen + ResourceAttributes: {'service.name':'telemetrygen'} + ScopeName: telemetrygen + ScopeVersion: + SpanAttributes: {'peer.service':'telemetrygen-server','net.peer.ip':'1.2.3.4'} + Duration: 123000 + StatusCode: STATUS_CODE_UNSET + StatusMessage: + Events.Timestamp: [] + Events.Name: [] + Events.Attributes: [] + Links.TraceId: [] + Links.SpanId: [] + Links.TraceState: [] + Links.Attributes: [] + ``` ## Out of the box schema {#out-of-the-box-schema} @@ -542,9 +539,9 @@ A few important notes on this schema: - Most other types here e.g. `ServiceName` as LowCardinality, are optimized. 
Note that `Body`, which is JSON in our example logs, is stored as a String. - Bloom filters are applied to map keys and values, as well as the `Body` column. These aim to improve query times for queries accessing these columns but are typically not required. See [Secondary/Data skipping indices](/use-cases/observability/schema-design#secondarydata-skipping-indices). -```sql -CREATE TABLE default.otel_traces -( + ```sql + CREATE TABLE default.otel_traces + ( `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)), `TraceId` String CODEC(ZSTD(1)), `SpanId` String CODEC(ZSTD(1)), @@ -573,17 +570,17 @@ CREATE TABLE default.otel_traces INDEX idx_span_attr_key mapKeys(SpanAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_span_attr_value mapValues(SpanAttributes) TYPE bloom_filter(0.01) GRANULARITY 1, INDEX idx_duration Duration TYPE minmax GRANULARITY 1 -) -ENGINE = MergeTree -PARTITION BY toDate(Timestamp) -ORDER BY (ServiceName, SpanName, toUnixTimestamp(Timestamp), TraceId) -TTL toDateTime(Timestamp) + toIntervalDay(3) -SETTINGS ttl_only_drop_parts = 1 -``` + ) + ENGINE = MergeTree + PARTITION BY toDate(Timestamp) + ORDER BY (ServiceName, SpanName, toUnixTimestamp(Timestamp), TraceId) + TTL toDateTime(Timestamp) + toIntervalDay(3) + SETTINGS ttl_only_drop_parts = 1 + ``` -Again, this will correlate with the columns corresponding to OTel official specification for traces documented [here](https://opentelemetry.io/docs/specs/otel/trace/api/). The schema here employs many of the same settings as the above logs schema with additional Link columns specific to spans. + Again, this will correlate with the columns corresponding to OTel official specification for traces documented [here](https://opentelemetry.io/docs/specs/otel/trace/api/). The schema here employs many of the same settings as the above logs schema with additional Link columns specific to spans. -We recommend users disable auto schema creation and create their tables manually. This allows modification of the primary and secondary keys, as well as the opportunity to introduce additional columns for optimizing query performance. For further details see [Schema design](/use-cases/observability/schema-design). + We recommend users disable auto schema creation and create their tables manually. This allows modification of the primary and secondary keys, as well as the opportunity to introduce additional columns for optimizing query performance. For further details see [Schema design](/use-cases/observability/schema-design). ## Optimizing inserts {#optimizing-inserts} @@ -598,9 +595,9 @@ By default, inserts into ClickHouse are synchronous and idempotent if identical. - (1) If the node receiving the data has issues, the insert query will time out (or get a more specific error) and not receive an acknowledgment. - (2) If the data got written by the node, but the acknowledgement can't be returned to the sender of the query because of network interruptions, the sender will either get a time-out or a network error. -From the collector's perspective, (1) and (2) can be hard to distinguish. However, in both cases, the unacknowledged insert can just immediately be retried. As long as the retried insert query contains the same data in the same order, ClickHouse will automatically ignore the retried insert if the (unacknowledged) original insert succeeded. + From the collector's perspective, (1) and (2) can be hard to distinguish. However, in both cases, the unacknowledged insert can just immediately be retried. 
As long as the retried insert query contains the same data in the same order, ClickHouse will automatically ignore the retried insert if the (unacknowledged) original insert succeeded. -We recommend users use the [batch processor](https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/batchprocessor/README.md) shown in earlier configurations to satisfy the above. This ensures inserts are sent as consistent batches of rows satisfying the above requirements. If a collector is expected to have high throughput (events per second), and at least 5000 events can be sent in each insert, this is usually the only batching required in the pipeline. In this case the collector will flush batches before the batch processor's `timeout` is reached, ensuring the end-to-end latency of the pipeline remains low and batches are of a consistent size. + We recommend users use the [batch processor](https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/batchprocessor/README.md) shown in earlier configurations to satisfy the above. This ensures inserts are sent as consistent batches of rows satisfying the above requirements. If a collector is expected to have high throughput (events per second), and at least 5000 events can be sent in each insert, this is usually the only batching required in the pipeline. In this case the collector will flush batches before the batch processor's `timeout` is reached, ensuring the end-to-end latency of the pipeline remains low and batches are of a consistent size. ### Use asynchronous inserts {#use-asynchronous-inserts} @@ -657,31 +654,31 @@ receivers: filelog: include: - /opt/data/logs/access-structured.log - start_at: beginning - operators: + start_at: beginning + operators: - type: json_parser - timestamp: + timestamp: parse_from: attributes.time_local layout: '%Y-%m-%d %H:%M:%S' -processors: - batch: - timeout: 5s - send_batch_size: 1000 -exporters: - otlp: - endpoint: localhost:4317 - tls: - insecure: true # Set to false if you are using a secure connection -service: - telemetry: - metrics: - address: 0.0.0.0:9888 # Modified as 2 collectors running on same host - pipelines: - logs: - receivers: [filelog] - processors: [batch] - exporters: [otlp] -``` + processors: + batch: + timeout: 5s + send_batch_size: 1000 + exporters: + otlp: + endpoint: localhost:4317 + tls: + insecure: true # Set to false if you are using a secure connection + service: + telemetry: + metrics: + address: 0.0.0.0:9888 # Modified as 2 collectors running on same host + pipelines: + logs: + receivers: [filelog] + processors: [batch] + exporters: [otlp] + ``` [clickhouse-gateway-config.yaml](https://www.otelbin.io/#config=receivers%3A*N__otlp%3A*N____protocols%3A*N____grpc%3A*N____endpoint%3A_0.0.0.0%3A4317*N*Nprocessors%3A*N__batch%3A*N____timeout%3A_5s*N____send*_batch*_size%3A_10000*N*Nexporters%3A*N__clickhouse%3A*N____endpoint%3A_tcp%3A%2F%2Flocalhost%3A9000*Qdial*_timeout*E10s*Acompress*Elz4*N____ttl%3A_96h*N____traces*_table*_name%3A_otel*_traces*N____logs*_table*_name%3A_otel*_logs*N____create*_schema%3A_true*N____timeout%3A_10s*N____database%3A_default*N____sending*_queue%3A*N____queue*_size%3A_10000*N____retry*_on*_failure%3A*N____enabled%3A_true*N____initial*_interval%3A_5s*N____max*_interval%3A_30s*N____max*_elapsed*_time%3A_300s*N*Nservice%3A*N__pipelines%3A*N____logs%3A*N______receivers%3A_%5Botlp%5D*N______processors%3A_%5Bbatch%5D*N______exporters%3A_%5Bclickhouse%5D%7E&distro=otelcol-contrib%7E&distroVersion=v0.103.1%7E) diff --git 
a/docs/use-cases/observability/build-your-own/introduction.md b/docs/use-cases/observability/build-your-own/introduction.md index ef2cacbcb33..1693bbdf204 100644 --- a/docs/use-cases/observability/build-your-own/introduction.md +++ b/docs/use-cases/observability/build-your-own/introduction.md @@ -57,7 +57,7 @@ SQL-based observability is for you if: - You want to be in control of the TCO (total cost of ownership) and avoid spiraling observability costs. - You can't or don't want to get stuck with small data retention periods for your observability data just to manage the costs. -SQL-based observability may not be for you if: + SQL-based observability may not be for you if: - Learning (or generating!) SQL is not appealing to you or your team(s). - You are looking for a packaged, end-to-end observability experience. @@ -73,16 +73,16 @@ We currently recommend ClickHouse for storing two types of observability data: - **Logs** - Logs are time-stamped records of events occurring within a system, capturing detailed information about various aspects of software operations. The data in logs is typically unstructured or semi-structured and can include error messages, user activity logs, system changes, and other events. Logs are crucial for troubleshooting, anomaly detection, and understanding the specific events leading up to issues within the system. -```response -54.36.149.41 - - [22/Jan/2019:03:56:14 +0330] "GET -/filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C%DA%A9%D8%B3%D9%84,27|%DA%A9%D9%85%D8%AA%D8%B1%20%D8%A7%D8%B2%205%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C%DA%A9%D8%B3%D9%84,p53 HTTP/1.1" 200 30577 "-" "Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)" "-" -``` + ```response + 54.36.149.41 - - [22/Jan/2019:03:56:14 +0330] "GET + /filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C%DA%A9%D8%B3%D9%84,27|%DA%A9%D9%85%D8%AA%D8%B1%20%D8%A7%D8%B2%205%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C%DA%A9%D8%B3%D9%84,p53 HTTP/1.1" 200 30577 "-" "Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)" "-" + ``` - **Traces** - Traces capture the journey of requests as they traverse through different services in a distributed system, detailing the path and performance of these requests. The data in traces is highly structured, consisting of spans and traces that map out each step a request takes, including timing information. Traces provide valuable insights into system performance, helping identify bottlenecks, latency issues, and optimize the efficiency of microservices. -:::note Metrics -While ClickHouse can be used to store metrics data, this pillar is less mature in ClickHouse with pending support for features such as support for the Prometheus data format and PromQL. -::: + :::note Metrics + While ClickHouse can be used to store metrics data, this pillar is less mature in ClickHouse with pending support for features such as support for the Prometheus data format and PromQL. 
+ ::: ### Distributed tracing {#distributed-tracing} diff --git a/docs/use-cases/observability/build-your-own/managing-data.md b/docs/use-cases/observability/build-your-own/managing-data.md index 840ac333880..d115550a310 100644 --- a/docs/use-cases/observability/build-your-own/managing-data.md +++ b/docs/use-cases/observability/build-your-own/managing-data.md @@ -9,7 +9,6 @@ show_related_blogs: true import observability_14 from '@site/static/images/use-cases/observability/observability-14.png'; import Image from '@theme/IdealImage'; - # Managing data Deployments of ClickHouse for Observability invariably involve large datasets, which need to be managed. ClickHouse offers a number of features to assist with data management. @@ -143,7 +142,6 @@ ORDER BY c DESC This feature is exploited by TTL when the setting [`ttl_only_drop_parts=1`](/operations/settings/merge-tree-settings#ttl_only_drop_parts) is used. See [Data management with TTL](#data-management-with-ttl-time-to-live) for further details. ::: - ### Applications {#applications} The above illustrates how data can be efficiently moved and manipulated by partition. In reality, users will likely most frequently exploit partition operations in Observability use cases for two scenarios: @@ -151,7 +149,7 @@ The above illustrates how data can be efficiently moved and manipulated by parti - **Tiered architectures** - Moving data between storage tiers (see [Storage tiers](#storage-tiers)), thus allowing hot-cold architectures to be constructed. - **Efficient deletion** - when data has reached a specified TTL (see [Data management with TTL](#data-management-with-ttl-time-to-live)) -We explore both of these in detail below. + We explore both of these in detail below. ### Query performance {#query-performance} diff --git a/docs/use-cases/observability/build-your-own/schema-design.md b/docs/use-cases/observability/build-your-own/schema-design.md index bba6f764bfc..50328c94b05 100644 --- a/docs/use-cases/observability/build-your-own/schema-design.md +++ b/docs/use-cases/observability/build-your-own/schema-design.md @@ -23,9 +23,9 @@ We recommend users always create their own schema for logs and traces for the fo - **Secondary indices** - The default schema uses secondary indices for speeding up access to Maps and accelerating text queries. These are typically not required and incur additional disk space. They can be used but should be tested to ensure they are required. See ["Secondary / Data Skipping indices"](#secondarydata-skipping-indices). - **Using Codecs** - Users may wish to customize codecs for columns if they understand the anticipated data and have evidence this improves compression. -_We describe each of the above use cases in detail below._ + _We describe each of the above use cases in detail below._ -**Important:** While users are encouraged to extend and modify their schema to achieve optimal compression and query performance, they should adhere to the OTel schema naming for core columns where possible. The ClickHouse Grafana plugin assumes the existence of some basic OTel columns to assist with query building e.g. Timestamp and SeverityText. The required columns for logs and traces are documented here [[1]](https://grafana.com/developers/plugin-tools/tutorials/build-a-logs-data-source-plugin#logs-data-frame-format)[[2]](https://grafana.com/docs/grafana/latest/explore/logs-integration/) and [here](https://grafana.com/docs/grafana/latest/explore/trace-integration/#data-frame-structure), respectively. 
You can choose to change these column names, overriding the defaults in the plugin configuration. + **Important:** While users are encouraged to extend and modify their schema to achieve optimal compression and query performance, they should adhere to the OTel schema naming for core columns where possible. The ClickHouse Grafana plugin assumes the existence of some basic OTel columns to assist with query building e.g. Timestamp and SeverityText. The required columns for logs and traces are documented here [[1]](https://grafana.com/developers/plugin-tools/tutorials/build-a-logs-data-source-plugin#logs-data-frame-format)[[2]](https://grafana.com/docs/grafana/latest/explore/logs-integration/) and [here](https://grafana.com/docs/grafana/latest/explore/trace-integration/#data-frame-structure), respectively. You can choose to change these column names, overriding the defaults in the plugin configuration. ## Extracting structure with SQL {#extracting-structure-with-sql} @@ -34,124 +34,124 @@ Whether ingesting structured or unstructured logs, users often need the ability - **Extract columns from string blobs**. Querying these will be faster than using string operations at query time. - **Extract keys from maps**. The default schema places arbitrary attributes into columns of the Map type. This type provides a schema-less capability that has the advantage of users not needing to pre-define the columns for attributes when defining logs and traces - often, this is impossible when collecting logs from Kubernetes and wanting to ensure pod labels are retained for later search. Accessing map keys and their values is slower than querying on normal ClickHouse columns. Extracting keys from maps to root table columns is, therefore, often desirable. -Consider the following queries: - -Suppose we wish to count which URL paths receive the most POST requests using the structured logs. The JSON blob is stored within the `Body` column as a String. Additionally, it may also be stored in the `LogAttributes` column as a `Map(String, String)` if the user has enabled the json_parser in the collector. - -```sql -SELECT LogAttributes -FROM otel_logs -LIMIT 1 -FORMAT Vertical - -Row 1: -────── -Body: {"remote_addr":"54.36.149.41","remote_user":"-","run_time":"0","time_local":"2019-01-22 00:26:14.000","request_type":"GET","request_path":"\/filter\/27|13 ,27| 5 ,p53","request_protocol":"HTTP\/1.1","status":"200","size":"30577","referer":"-","user_agent":"Mozilla\/5.0 (compatible; AhrefsBot\/6.1; +http:\/\/ahrefs.com\/robot\/)"} -LogAttributes: {'status':'200','log.file.name':'access-structured.log','request_protocol':'HTTP/1.1','run_time':'0','time_local':'2019-01-22 00:26:14.000','size':'30577','user_agent':'Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)','referer':'-','remote_user':'-','request_type':'GET','request_path':'/filter/27|13 ,27| 5 ,p53','remote_addr':'54.36.149.41'} -``` - -Assuming the `LogAttributes` is available, the query to count which URL paths of the site receive the most POST requests: - -```sql -SELECT path(LogAttributes['request_path']) AS path, count() AS c -FROM otel_logs -WHERE ((LogAttributes['request_type']) = 'POST') -GROUP BY path -ORDER BY c DESC -LIMIT 5 - -┌─path─────────────────────┬─────c─┐ -│ /m/updateVariation │ 12182 │ -│ /site/productCard │ 11080 │ -│ /site/productPrice │ 10876 │ -│ /site/productModelImages │ 10866 │ -│ /site/productAdditives │ 10866 │ -└──────────────────────────┴───────┘ - -5 rows in set. Elapsed: 0.735 sec. 
Processed 10.36 million rows, 4.65 GB (14.10 million rows/s., 6.32 GB/s.) -Peak memory usage: 153.71 MiB. -``` - -Note the use of the map syntax here e.g. `LogAttributes['request_path']`, and the [`path` function](/sql-reference/functions/url-functions#path) for stripping query parameters from the URL. - -If the user has not enabled JSON parsing in the collector, then `LogAttributes` will be empty, forcing us to use [JSON functions](/sql-reference/functions/json-functions) to extract the columns from the String `Body`. - -:::note Prefer ClickHouse for parsing -We generally recommend users perform JSON parsing in ClickHouse of structured logs. We are confident ClickHouse is the fastest JSON parsing implementation. However, we recognize users may wish to send logs to other sources and not have this logic reside in SQL. -::: - -```sql -SELECT path(JSONExtractString(Body, 'request_path')) AS path, count() AS c -FROM otel_logs -WHERE JSONExtractString(Body, 'request_type') = 'POST' -GROUP BY path -ORDER BY c DESC -LIMIT 5 - -┌─path─────────────────────┬─────c─┐ -│ /m/updateVariation │ 12182 │ -│ /site/productCard │ 11080 │ -│ /site/productPrice │ 10876 │ -│ /site/productAdditives │ 10866 │ -│ /site/productModelImages │ 10866 │ -└──────────────────────────┴───────┘ - -5 rows in set. Elapsed: 0.668 sec. Processed 10.37 million rows, 5.13 GB (15.52 million rows/s., 7.68 GB/s.) -Peak memory usage: 172.30 MiB. -``` - -Now consider the same for unstructured logs: - -```sql -SELECT Body, LogAttributes -FROM otel_logs -LIMIT 1 -FORMAT Vertical - -Row 1: -────── -Body: 151.233.185.144 - - [22/Jan/2019:19:08:54 +0330] "GET /image/105/brand HTTP/1.1" 200 2653 "https://www.zanbil.ir/filter/b43,p56" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36" "-" -LogAttributes: {'log.file.name':'access-unstructured.log'} -``` - -A similar query for the unstructured logs requires the use of regular expressions via the [`extractAllGroupsVertical` function](/sql-reference/functions/string-search-functions#extractallgroupsvertical). - -```sql -SELECT + Consider the following queries: + + Suppose we wish to count which URL paths receive the most POST requests using the structured logs. The JSON blob is stored within the `Body` column as a String. Additionally, it may also be stored in the `LogAttributes` column as a `Map(String, String)` if the user has enabled the json_parser in the collector. 
+ + ```sql + SELECT LogAttributes + FROM otel_logs + LIMIT 1 + FORMAT Vertical + + Row 1: + ────── + Body: {"remote_addr":"54.36.149.41","remote_user":"-","run_time":"0","time_local":"2019-01-22 00:26:14.000","request_type":"GET","request_path":"\/filter\/27|13 ,27| 5 ,p53","request_protocol":"HTTP\/1.1","status":"200","size":"30577","referer":"-","user_agent":"Mozilla\/5.0 (compatible; AhrefsBot\/6.1; +http:\/\/ahrefs.com\/robot\/)"} + LogAttributes: {'status':'200','log.file.name':'access-structured.log','request_protocol':'HTTP/1.1','run_time':'0','time_local':'2019-01-22 00:26:14.000','size':'30577','user_agent':'Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)','referer':'-','remote_user':'-','request_type':'GET','request_path':'/filter/27|13 ,27| 5 ,p53','remote_addr':'54.36.149.41'} + ``` + + Assuming the `LogAttributes` is available, the query to count which URL paths of the site receive the most POST requests: + + ```sql + SELECT path(LogAttributes['request_path']) AS path, count() AS c + FROM otel_logs + WHERE ((LogAttributes['request_type']) = 'POST') + GROUP BY path + ORDER BY c DESC + LIMIT 5 + + ┌─path─────────────────────┬─────c─┐ + │ /m/updateVariation │ 12182 │ + │ /site/productCard │ 11080 │ + │ /site/productPrice │ 10876 │ + │ /site/productModelImages │ 10866 │ + │ /site/productAdditives │ 10866 │ + └──────────────────────────┴───────┘ + + 5 rows in set. Elapsed: 0.735 sec. Processed 10.36 million rows, 4.65 GB (14.10 million rows/s., 6.32 GB/s.) + Peak memory usage: 153.71 MiB. + ``` + + Note the use of the map syntax here e.g. `LogAttributes['request_path']`, and the [`path` function](/sql-reference/functions/url-functions#path) for stripping query parameters from the URL. + + If the user has not enabled JSON parsing in the collector, then `LogAttributes` will be empty, forcing us to use [JSON functions](/sql-reference/functions/json-functions) to extract the columns from the String `Body`. + + :::note Prefer ClickHouse for parsing + We generally recommend users perform JSON parsing in ClickHouse of structured logs. We are confident ClickHouse is the fastest JSON parsing implementation. However, we recognize users may wish to send logs to other sources and not have this logic reside in SQL. + ::: + + ```sql + SELECT path(JSONExtractString(Body, 'request_path')) AS path, count() AS c + FROM otel_logs + WHERE JSONExtractString(Body, 'request_type') = 'POST' + GROUP BY path + ORDER BY c DESC + LIMIT 5 + + ┌─path─────────────────────┬─────c─┐ + │ /m/updateVariation │ 12182 │ + │ /site/productCard │ 11080 │ + │ /site/productPrice │ 10876 │ + │ /site/productAdditives │ 10866 │ + │ /site/productModelImages │ 10866 │ + └──────────────────────────┴───────┘ + + 5 rows in set. Elapsed: 0.668 sec. Processed 10.37 million rows, 5.13 GB (15.52 million rows/s., 7.68 GB/s.) + Peak memory usage: 172.30 MiB. 
+ ``` + + Now consider the same for unstructured logs: + + ```sql + SELECT Body, LogAttributes + FROM otel_logs + LIMIT 1 + FORMAT Vertical + + Row 1: + ────── + Body: 151.233.185.144 - - [22/Jan/2019:19:08:54 +0330] "GET /image/105/brand HTTP/1.1" 200 2653 "https://www.zanbil.ir/filter/b43,p56" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36" "-" + LogAttributes: {'log.file.name':'access-unstructured.log'} + ``` + + A similar query for the unstructured logs requires the use of regular expressions via the [`extractAllGroupsVertical` function](/sql-reference/functions/string-search-functions#extractallgroupsvertical). + + ```sql + SELECT path((groups[1])[2]) AS path, count() AS c -FROM -( + FROM + ( SELECT extractAllGroupsVertical(Body, '(\\w+)\\s([^\\s]+)\\sHTTP/\\d\\.\\d') AS groups FROM otel_logs WHERE ((groups[1])[1]) = 'POST' -) -GROUP BY path -ORDER BY c DESC -LIMIT 5 + ) + GROUP BY path + ORDER BY c DESC + LIMIT 5 -┌─path─────────────────────┬─────c─┐ -│ /m/updateVariation │ 12182 │ -│ /site/productCard │ 11080 │ -│ /site/productPrice │ 10876 │ -│ /site/productModelImages │ 10866 │ -│ /site/productAdditives │ 10866 │ -└──────────────────────────┴───────┘ + ┌─path─────────────────────┬─────c─┐ + │ /m/updateVariation │ 12182 │ + │ /site/productCard │ 11080 │ + │ /site/productPrice │ 10876 │ + │ /site/productModelImages │ 10866 │ + │ /site/productAdditives │ 10866 │ + └──────────────────────────┴───────┘ -5 rows in set. Elapsed: 1.953 sec. Processed 10.37 million rows, 3.59 GB (5.31 million rows/s., 1.84 GB/s.) -``` + 5 rows in set. Elapsed: 1.953 sec. Processed 10.37 million rows, 3.59 GB (5.31 million rows/s., 1.84 GB/s.) + ``` -The increased complexity and cost of queries for parsing unstructured logs (notice performance difference) is why we recommend users always use structured logs where possible. + The increased complexity and cost of queries for parsing unstructured logs (notice performance difference) is why we recommend users always use structured logs where possible. -:::note Consider dictionaries -The above query could be optimized to exploit regular expression dictionaries. See [Using Dictionaries](#using-dictionaries) for more detail. -::: + :::note Consider dictionaries + The above query could be optimized to exploit regular expression dictionaries. See [Using Dictionaries](#using-dictionaries) for more detail. + ::: -Both of these use cases can be satisfied using ClickHouse by moving the above query logic to insert time. We explore several approaches below, highlighting when each is appropriate. + Both of these use cases can be satisfied using ClickHouse by moving the above query logic to insert time. We explore several approaches below, highlighting when each is appropriate. -:::note OTel or ClickHouse for processing? -Users may also perform processing using OTel Collector processors and operators as described [here](/observability/integrating-opentelemetry#processing---filtering-transforming-and-enriching). In most cases, users will find ClickHouse is significantly more resource-efficient and faster than the collector's processors. The principal downside of performing all event processing in SQL is the coupling of your solution to ClickHouse. For example, users may wish to send processed logs to alternative destinations from the OTel collector e.g. S3. -::: + :::note OTel or ClickHouse for processing? 
+ Users may also perform processing using OTel Collector processors and operators as described [here](/observability/integrating-opentelemetry#processing---filtering-transforming-and-enriching). In most cases, users will find ClickHouse is significantly more resource-efficient and faster than the collector's processors. The principal downside of performing all event processing in SQL is the coupling of your solution to ClickHouse. For example, users may wish to send processed logs to alternative destinations from the OTel collector e.g. S3. + ::: ### Materialized columns {#materialized-columns} @@ -161,7 +161,6 @@ Materialized columns offer the simplest solution to extract structure from other Materialized columns incur additional storage overhead as the values are extracted to new columns on disk at insert time. ::: - Materialized columns support any ClickHouse expression and can exploit any of the analytical functions for [processing strings](/sql-reference/functions/string-functions) (including [regex and searching](/sql-reference/functions/string-search-functions)) and [urls](/sql-reference/functions/url-functions), performing [type conversions](/sql-reference/functions/type-conversion-functions), [extracting values from JSON](/sql-reference/functions/json-functions) or [mathematical operations](/sql-reference/functions/math-functions). We recommend materialized columns for basic processing. They are especially useful for extracting values from maps, promoting them to root columns, and performing type conversions. They are often most useful when used in very basic schemas or in conjunction with materialized views. Consider the following schema for logs from which the JSON has been extracted to the `LogAttributes` column by the collector: @@ -233,7 +232,6 @@ Materialized Views allow users to shift the cost of computation from query time Materialized views in ClickHouse are updated in real time as data flows into the table they are based on, functioning more like continually updating indexes. In contrast, in other databases materialized views are typically static snapshots of a query that must be refreshed (similar to ClickHouse Refreshable Materialized Views). ::: - The query associated with the materialized view can theoretically be any query, including an aggregation although [limitations exist with Joins](https://clickhouse.com/blog/using-materialized-views-in-clickhouse#materialized-views-and-joins). For the transformations and filtering workloads required for logs and traces, users can consider any `SELECT` statement to be possible. Users should remember the query is just a trigger executing over the rows being inserted into a table (the source table), with the results sent to a new table (the target table). @@ -267,7 +265,7 @@ Consider the following query. This transforms our rows into a format we wish to ```sql SELECT - Body, + Body, Timestamp::DateTime AS Timestamp, ServiceName, LogAttributes['status'] AS Status, @@ -350,13 +348,12 @@ The types selected here are based on optimizations discussed in ["Optimizing typ Notice how we have dramatically changed our schema. In reality users will likely also have Trace columns they will want to preserve as well as the column `ResourceAttributes` (this usually contains Kubernetes metadata). Grafana can exploit trace columns to provide linking functionality between logs and traces - see ["Using Grafana"](/observability/grafana). 
::: - Below, we create a materialized view `otel_logs_mv`, which executes the above select for the `otel_logs` table and sends the results to `otel_logs_v2`. ```sql CREATE MATERIALIZED VIEW otel_logs_mv TO otel_logs_v2 AS SELECT - Body, + Body, Timestamp::DateTime AS Timestamp, ServiceName, LogAttributes['status']::UInt16 AS Status, @@ -415,7 +412,7 @@ An equivalent Materialized view, which relies on extracting columns from the `Bo ```sql CREATE MATERIALIZED VIEW otel_logs_mv TO otel_logs_v2 AS -SELECT Body, +SELECT Body, Timestamp::DateTime AS Timestamp, ServiceName, JSONExtractUInt(Body, 'status') AS Status, @@ -443,9 +440,9 @@ The above materialized views rely on implicit casting - especially in the case o - Some types will not always be cast e.g. string representations of numerics will not be cast to enum values. - JSON extract functions return default values for their type if a value is not found. Ensure these values make sense! -:::note Avoid Nullable -Avoid using [Nullable](/sql-reference/data-types/nullable) in Clickhouse for Observability data. It is rarely required in logs and traces to be able to distinguish between empty and null. This feature incurs an additional storage overhead and will negatively impact query performance. See [here](/data-modeling/schema-design#optimizing-types) for further details. -::: + :::note Avoid Nullable + Avoid using [Nullable](/sql-reference/data-types/nullable) in Clickhouse for Observability data. It is rarely required in logs and traces to be able to distinguish between empty and null. This feature incurs an additional storage overhead and will negatively impact query performance. See [here](/data-modeling/schema-design#optimizing-types) for further details. + ::: ## Choosing a primary (ordering) key {#choosing-a-primary-ordering-key} @@ -458,13 +455,11 @@ Some simple rules can be applied to help choose an ordering key. The following c 3. Prefer columns that are likely to be highly correlated with other columns in the table. This will help ensure these values are also stored contiguously, improving compression. 4. `GROUP BY` and `ORDER BY` operations for columns in the ordering key can be made more memory efficient. -
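As a rough illustration only (not taken from the guide), the sketch below applies this guidance to the structured schema discussed earlier, using a hypothetical table name and assuming queries typically filter by `ServiceName` and `Status`:

```sql
-- Hypothetical sketch: lower-cardinality columns first in the ordering key,
-- with the timestamp last. Column choices assume the structured log schema above.
CREATE TABLE otel_logs_ordered
(
    `Timestamp` DateTime,
    `ServiceName` LowCardinality(String),
    `Status` UInt16,
    `RequestPath` String,
    `Body` String CODEC(ZSTD(1))
)
ENGINE = MergeTree
ORDER BY (ServiceName, Status, Timestamp)
```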
    + On identifying the subset of columns for the ordering key, they must be declared in a specific order. This order can significantly influence both the efficiency of the filtering on secondary key columns in queries and the compression ratio for the table's data files. In general, it is **best to order the keys in ascending order of cardinality**. This should be balanced against the fact that filtering on columns that appear later in the ordering key will be less efficient than filtering on those that appear earlier in the tuple. Balance these behaviors and consider your access patterns. Most importantly, test variants. For further understanding of ordering keys and how to optimize them, we recommend [this article](/guides/best-practices/sparse-primary-indexes). -On identifying the subset of columns for the ordering key, they must be declared in a specific order. This order can significantly influence both the efficiency of the filtering on secondary key columns in queries and the compression ratio for the table's data files. In general, it is **best to order the keys in ascending order of cardinality**. This should be balanced against the fact that filtering on columns that appear later in the ordering key will be less efficient than filtering on those that appear earlier in the tuple. Balance these behaviors and consider your access patterns. Most importantly, test variants. For further understanding of ordering keys and how to optimize them, we recommend [this article](/guides/best-practices/sparse-primary-indexes). - -:::note Structure first -We recommend deciding on your ordering keys once you have structured your logs. Do not use keys in attribute maps for the ordering key or JSON extraction expressions. Ensure you have your ordering keys as root columns in your table. -::: + :::note Structure first + We recommend deciding on your ordering keys once you have structured your logs. Do not use keys in attribute maps for the ordering key or JSON extraction expressions. Ensure you have your ordering keys as root columns in your table. + ::: ## Using maps {#using-maps} @@ -489,7 +484,6 @@ Peak memory usage: 71.90 MiB. We don't recommend using dots in Map column names and may deprecate its use. Use an `_`. ::: - ## Using aliases {#using-aliases} Querying map types is slower than querying normal columns - see ["Accelerating queries"](#accelerating-queries). In addition, it's more syntactically complicated and can be cumbersome for users to write. To address this latter issue we recommend using Alias columns. @@ -592,7 +586,6 @@ While joins are rarely required in Observability use cases, dictionaries can sti Users interested in accelerating joins with dictionaries can find further details [here](/dictionary). ::: - ### Insert time vs query time {#insert-time-vs-query-time} Dictionaries can be used for enriching datasets at query time or insert time. Each of these approaches have their respective pros and cons. In summary: @@ -600,9 +593,9 @@ Dictionaries can be used for enriching datasets at query time or insert time. Ea - **Insert time** - This is typically appropriate if the enrichment value does not change and exists in an external source which can be used to populate the dictionary. In this case, enriching the row at insert time avoids the query time lookup to the dictionary. This comes at the cost of insert performance as well as an additional storage overhead, as enriched values will be stored as columns. 
- **Query time** - If values in a dictionary change frequently, query time lookups are often more applicable. This avoids needing to update columns (and rewrite data) if mapped values change. This flexibility comes at the expense of a query time lookup cost. This query time cost is typically appreciable if a lookup is required for many rows, e.g. using a dictionary lookup in a filter clause. For result enrichment, i.e. in the `SELECT`, this overhead is typically not appreciable. -We recommend that users familiarize themselves with the basics of dictionaries. Dictionaries provide an in-memory lookup table from which values can be retrieved using dedicated [specialist functions](/sql-reference/functions/ext-dict-functions#dictgetall). + We recommend that users familiarize themselves with the basics of dictionaries. Dictionaries provide an in-memory lookup table from which values can be retrieved using dedicated [specialist functions](/sql-reference/functions/ext-dict-functions#dictgetall). -For simple enrichment examples see the guide on Dictionaries [here](/dictionary). Below, we focus on common observability enrichment tasks. + For simple enrichment examples see the guide on Dictionaries [here](/dictionary). Below, we focus on common observability enrichment tasks. ### Using IP dictionaries {#using-ip-dictionaries} @@ -673,7 +666,7 @@ WITH SELECT ip_range_start, ip_range_end, - concat(toString(cidr_address),'/',toString(cidr_suffix)) AS cidr + concat(toString(cidr_address),'/',toString(cidr_suffix)) AS cidr FROM geoip_url LIMIT 4; @@ -715,7 +708,7 @@ SELECT concat(toString(cidr_address),'/',toString(cidr_suffix)) as cidr, latitude, longitude, - country_code + country_code FROM geoip_url ``` @@ -1106,51 +1099,51 @@ Since the merging of rows is asynchronous, there may be more than one row per ho - Use the [`FINAL` modifier](/sql-reference/statements/select/from#final-modifier) on the table name (which we did for the count query above). - Aggregate by the ordering key used in our final table i.e. Timestamp and sum the metrics. -Typically, the second option is more efficient and flexible (the table can be used for other things), but the first can be simpler for some queries. We show both below: + Typically, the second option is more efficient and flexible (the table can be used for other things), but the first can be simpler for some queries. We show both below: -```sql -SELECT + ```sql + SELECT Hour, sum(TotalBytes) AS TotalBytes -FROM bytes_per_hour -GROUP BY Hour -ORDER BY Hour DESC -LIMIT 5 - -┌────────────────Hour─┬─TotalBytes─┐ -│ 2019-01-26 16:00:00 │ 1661716343 │ -│ 2019-01-26 15:00:00 │ 1824015281 │ -│ 2019-01-26 14:00:00 │ 1506284139 │ -│ 2019-01-26 13:00:00 │ 1580955392 │ -│ 2019-01-26 12:00:00 │ 1736840933 │ -└─────────────────────┴────────────┘ - -5 rows in set. Elapsed: 0.008 sec. - -SELECT + FROM bytes_per_hour + GROUP BY Hour + ORDER BY Hour DESC + LIMIT 5 + + ┌────────────────Hour─┬─TotalBytes─┐ + │ 2019-01-26 16:00:00 │ 1661716343 │ + │ 2019-01-26 15:00:00 │ 1824015281 │ + │ 2019-01-26 14:00:00 │ 1506284139 │ + │ 2019-01-26 13:00:00 │ 1580955392 │ + │ 2019-01-26 12:00:00 │ 1736840933 │ + └─────────────────────┴────────────┘ + + 5 rows in set. Elapsed: 0.008 sec. 
+ + SELECT Hour, TotalBytes -FROM bytes_per_hour -FINAL -ORDER BY Hour DESC -LIMIT 5 + FROM bytes_per_hour + FINAL + ORDER BY Hour DESC + LIMIT 5 -┌────────────────Hour─┬─TotalBytes─┐ -│ 2019-01-26 16:00:00 │ 1661716343 │ -│ 2019-01-26 15:00:00 │ 1824015281 │ -│ 2019-01-26 14:00:00 │ 1506284139 │ -│ 2019-01-26 13:00:00 │ 1580955392 │ -│ 2019-01-26 12:00:00 │ 1736840933 │ -└─────────────────────┴────────────┘ + ┌────────────────Hour─┬─TotalBytes─┐ + │ 2019-01-26 16:00:00 │ 1661716343 │ + │ 2019-01-26 15:00:00 │ 1824015281 │ + │ 2019-01-26 14:00:00 │ 1506284139 │ + │ 2019-01-26 13:00:00 │ 1580955392 │ + │ 2019-01-26 12:00:00 │ 1736840933 │ + └─────────────────────┴────────────┘ -5 rows in set. Elapsed: 0.005 sec. -``` + 5 rows in set. Elapsed: 0.005 sec. + ``` -This has sped up our query from 0.6s to 0.008s - over 75 times! + This has sped up our query from 0.6s to 0.008s - over 75 times! -:::note -These savings can be even greater on larger datasets with more complex queries. See [here](https://github.com/ClickHouse/clickpy) for examples. -::: + :::note + These savings can be even greater on larger datasets with more complex queries. See [here](https://github.com/ClickHouse/clickpy) for examples. + ::: #### A more complex example {#a-more-complex-example} @@ -1286,7 +1279,6 @@ CREATE TABLE otel_traces_trace_id_ts ENGINE = MergeTree ORDER BY (TraceId, toUnixTimestamp(Start)) - CREATE MATERIALIZED VIEW otel_traces_trace_id_ts_mv TO otel_traces_trace_id_ts ( `TraceId` String, @@ -1573,7 +1565,6 @@ WHERE Referer LIKE '%ultra%' 10 rows in set. Elapsed: 0.016 sec. - EXPLAIN indexes = 1 SELECT count() FROM otel_logs_bloom @@ -1586,7 +1577,7 @@ WHERE Referer LIKE '%ultra%' │ Filter ((WHERE + Change column names to column identifiers)) │ │ ReadFromMergeTree (default.otel_logs_bloom) │ │ Indexes: │ -│ PrimaryKey │ +│ PrimaryKey │ │ Condition: true │ │ Parts: 8/8 │ │ Granules: 1276/1276 │ @@ -1617,7 +1608,6 @@ ORDER BY sum(data_compressed_bytes) DESC 1 row in set. Elapsed: 0.018 sec. - SELECT `table`, formatReadableSize(data_compressed_bytes) AS compressed_size, diff --git a/docs/use-cases/observability/clickstack/alerts.md b/docs/use-cases/observability/clickstack/alerts.md index 63c1249fcc2..7096d293aee 100644 --- a/docs/use-cases/observability/clickstack/alerts.md +++ b/docs/use-cases/observability/clickstack/alerts.md @@ -10,7 +10,6 @@ description: 'Alerts with ClickStack' import Image from '@theme/IdealImage'; import search_alert from '@site/static/images/use-cases/observability/search_alert.png'; - ## Search alerts {#search-alerts} After entering a [search](/use-cases/observability/clickstack/search), you can create an alert to be @@ -18,7 +17,7 @@ notified when the number of events (logs or spans) matching the search exceeds o ### Creating an alert {#creating-an-alert} -You can create an alert by clicking the `Alerts` button on the top right of the `Search` page. +You can create an alert by clicking the `Alerts` button on the top right of the `Search` page. From here, you can name the alert, as well as set the threshold, duration, and notification method for the alert (Slack, Email, PagerDuty or Slack webhook). 
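Conceptually, a search alert reduces to a scheduled count of matching events over the alert window, compared against the configured threshold. A rough sketch of that idea (not the query HyperDX itself issues), assuming the default `otel_logs` table and a 5-minute window:

```sql
-- Illustrative only: count error-level events in the last 5 minutes;
-- an alert fires when this number exceeds the configured threshold.
SELECT count() AS matching_events
FROM otel_logs
WHERE SeverityText = 'error'
  AND Timestamp > now() - INTERVAL 5 MINUTE
```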
diff --git a/docs/use-cases/observability/clickstack/architecture.md b/docs/use-cases/observability/clickstack/architecture.md index f1e00cce062..276ebbd4b83 100644 --- a/docs/use-cases/observability/clickstack/architecture.md +++ b/docs/use-cases/observability/clickstack/architecture.md @@ -9,7 +9,6 @@ title: 'Architecture' import Image from '@theme/IdealImage'; import architecture from '@site/static/images/use-cases/observability/clickstack-architecture.png'; - The ClickStack architecture is built around three core components: **ClickHouse**, **HyperDX**, and a **OpenTelemetry (OTel) collector**. A **MongoDB** instance provides storage for the application state. Together, they provide a high-performance, open-source observability stack optimized for logs, metrics, and traces. ## Architecture overview {#architecture-overview} @@ -26,7 +25,7 @@ At the heart of ClickStack is ClickHouse, a column-oriented database designed fo - Native support for semi-structured JSON data, allowing dynamic schema evolution - A powerful SQL engine with hundreds of built-in analytical functions -ClickHouse handles observability data as wide events, allowing for deep correlation across logs, metrics, and traces in a single unified structure. + ClickHouse handles observability data as wide events, allowing for deep correlation across logs, metrics, and traces in a single unified structure. ## OpenTelemetry collector: data ingestion {#open-telemetry-collector} @@ -35,7 +34,7 @@ ClickStack includes a pre-configured OpenTelemetry (OTel) collector to ingest te - gRPC (port `4317`) - HTTP (port `4318`) -The collector exports telemetry to ClickHouse in efficient batches. It supports optimized table schemas per data source, ensuring scalable performance across all signal types. + The collector exports telemetry to ClickHouse in efficient batches. It supports optimized table schemas per data source, ensuring scalable performance across all signal types. ## HyperDX: the interface {#hyperdx} @@ -48,7 +47,7 @@ HyperDX is the user interface for ClickStack. It offers: - Dashboard creation and alert configuration - SQL query interface for advanced analysis -Designed specifically for ClickHouse, HyperDX combines powerful search with intuitive workflows, enabling users to spot anomalies, investigate issues, and gain insights fast. + Designed specifically for ClickHouse, HyperDX combines powerful search with intuitive workflows, enabling users to spot anomalies, investigate issues, and gain insights fast. ## MongoDB: application state {#mongo} @@ -59,6 +58,6 @@ ClickStack uses MongoDB to store application-level state, including: - User profiles - Saved visualizations -This separation of state from event data ensures performance and scalability while simplifying backup and configuration. + This separation of state from event data ensures performance and scalability while simplifying backup and configuration. -This modular architecture enables ClickStack to deliver an out-of-the-box observability platform that is fast, flexible, and open-source. + This modular architecture enables ClickStack to deliver an out-of-the-box observability platform that is fast, flexible, and open-source. 
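As a sketch of the cross-signal queries this unified storage enables (assuming the default OTel schema created by the bundled collector, with `otel_logs` and `otel_traces` tables):

```sql
-- Illustrative only: join error logs to their spans via the shared TraceId.
SELECT
    l.Timestamp,
    l.ServiceName,
    l.Body,
    t.SpanName,
    t.Duration
FROM otel_logs AS l
INNER JOIN otel_traces AS t ON l.TraceId = t.TraceId
WHERE l.SeverityText = 'error'
LIMIT 10
```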
diff --git a/docs/use-cases/observability/clickstack/config.md b/docs/use-cases/observability/clickstack/config.md index 8ba464723ab..4a4c102c927 100644 --- a/docs/use-cases/observability/clickstack/config.md +++ b/docs/use-cases/observability/clickstack/config.md @@ -83,13 +83,13 @@ ingress: kubernetes.io/ingress.class: nginx hosts: - host: hyperdx.example.com - paths: + paths: - path: / pathType: ImplementationSpecific - env: + env: - name: CLICKHOUSE_USER - value: abc -``` + value: abc + ``` ## HyperDX {#hyperdx} @@ -102,19 +102,19 @@ HyperDX relies on the user defining a source for each of the Observability data - `Metrics` - `Sessions` -This configuration can be performed inside the application from `Team Settings -> Sources`, as shown below for logs: + This configuration can be performed inside the application from `Team Settings -> Sources`, as shown below for logs: -HyperDX Source configuration + HyperDX Source configuration -Each of these sources require at least one table specified on creation as well as a set of columns which allow HyperDX to query the data. + Each of these sources require at least one table specified on creation as well as a set of columns which allow HyperDX to query the data. -If using the [default OpenTelemetry (OTel) schema](/observability/integrating-opentelemetry#out-of-the-box-schema) distributed with ClickStack, these columns can be automatically inferred for each of the sources. If [modifying the schema](#clickhouse) or using a custom schema, users are required to specify and update these mappings. + If using the [default OpenTelemetry (OTel) schema](/observability/integrating-opentelemetry#out-of-the-box-schema) distributed with ClickStack, these columns can be automatically inferred for each of the sources. If [modifying the schema](#clickhouse) or using a custom schema, users are required to specify and update these mappings. -:::note -The default schema for ClickHouse distributed with ClickStack is the schema created by the [ClickHouse exporter for the OTel collector](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/clickhouseexporter). These column names correlate with the OTel official specification documented [here](https://opentelemetry.io/docs/specs/otel/logs/data-model/). -::: + :::note + The default schema for ClickHouse distributed with ClickStack is the schema created by the [ClickHouse exporter for the OTel collector](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/clickhouseexporter). These column names correlate with the OTel official specification documented [here](https://opentelemetry.io/docs/specs/otel/logs/data-model/). + ::: -The following settings are available for each source: + The following settings are available for each source: #### Logs {#logs} @@ -167,8 +167,6 @@ The following settings are available for each source: | `Span Events Expression` | Expression to extract span events. Typically a `Nested` type column. This allows rendering of exception stack traces with supported language SDKs. | Yes | Yes | `Events` | | `Implicit Column Expression` | Column used for full-text search if no field is specified (Lucene-style). Typically the log body. | Yes | Yes | `SpanName`| - - #### Metrics {#metrics} | Setting | Description | Required | Inferred in Default Schema | Inferred Value | @@ -205,18 +203,18 @@ To enable full cross-source correlation in ClickStack, users must configure corr - `Metrics`: Can be correlated with logs. 
- `Sessions`: Can be correlated with traces. -By setting these correlations, HyperDX can, for example, render relevant logs alongside a trace or surface metric anomalies linked to a session. Proper configuration ensures a unified and contextual observability experience. + By setting these correlations, HyperDX can, for example, render relevant logs alongside a trace or surface metric anomalies linked to a session. Proper configuration ensures a unified and contextual observability experience. -For example, below is the Logs source configured with correlated sources: + For example, below is the Logs source configured with correlated sources: -HyperDX Source correlated + HyperDX Source correlated ### Application configuration settings {#application-configuration-settings} - `HYPERDX_API_KEY` - **Default:** None (required) - **Description:** Authentication key for the HyperDX API. - - **Guidance:** + - **Guidance:** - Required for telemetry and logging - In local development, can be any non-empty value - For production, use a secure, unique key @@ -342,52 +340,51 @@ For example, below is the Logs source configured with correlated sources: - **Guidance:** - Set to `true` to enable JSON support in ClickStack. - ## OpenTelemetry collector {#otel-collector} See ["ClickStack OpenTelemetry Collector"](/use-cases/observability/clickstack/ingesting-data/otel-collector) for more details. -- `CLICKHOUSE_ENDPOINT` - - **Default:** *None (required)* if standalone image. If All-in-one or Docker Compose distribution this is set to the integrated ClickHouse instance. - - **Description:** The HTTPS URL of the ClickHouse instance to export telemetry data to. - - **Guidance:** - - Must be a full HTTPS endpoint including port (e.g., `https://clickhouse.example.com:8443`) - - Required for the collector to send data to ClickHouse - -- `CLICKHOUSE_USER` - - **Default:** `default` - - **Description:** Username used to authenticate with the ClickHouse instance. - - **Guidance:** - - Ensure the user has `INSERT` and `CREATE TABLE` permissions - - Recommended to create a dedicated user for ingestion - -- `CLICKHOUSE_PASSWORD` - - **Default:** *None (required if authentication is enabled)* - - **Description:** Password for the specified ClickHouse user. - - **Guidance:** - - Required if the user account has a password set - - Store securely via secrets in production deployments - -- `HYPERDX_LOG_LEVEL` - - **Default:** `info` - - **Description:** Log verbosity level for the collector. - - **Guidance:** - - Accepts values like `debug`, `info`, `warn`, `error` - - Use `debug` during troubleshooting - -- `OPAMP_SERVER_URL` - - **Default:** *None (required)* if standalone image. If All-in-one or Docker Compose distribution this points to the deployed HyperDX instance. - - **Description:** URL of the OpAMP server used to manage the collector (e.g., HyperDX instance). This is port `4320` by default. - - **Guidance:** - - Must point to your HyperDX instance - - Enables dynamic configuration and secure ingestion - -- `HYPERDX_OTEL_EXPORTER_CLICKHOUSE_DATABASE` - - **Default:** `default` - - **Description:** ClickHouse database the collector writes telemetry data to. - - **Guidance:** - - Set if using a custom database name - - Ensure the specified user has access to this database +- `CLICKHOUSE_ENDPOINT` + - **Default:** *None (required)* if standalone image. If All-in-one or Docker Compose distribution this is set to the integrated ClickHouse instance. 
+ - **Description:** The HTTPS URL of the ClickHouse instance to export telemetry data to. + - **Guidance:** + - Must be a full HTTPS endpoint including port (e.g., `https://clickhouse.example.com:8443`) + - Required for the collector to send data to ClickHouse + +- `CLICKHOUSE_USER` + - **Default:** `default` + - **Description:** Username used to authenticate with the ClickHouse instance. + - **Guidance:** + - Ensure the user has `INSERT` and `CREATE TABLE` permissions + - Recommended to create a dedicated user for ingestion + +- `CLICKHOUSE_PASSWORD` + - **Default:** *None (required if authentication is enabled)* + - **Description:** Password for the specified ClickHouse user. + - **Guidance:** + - Required if the user account has a password set + - Store securely via secrets in production deployments + +- `HYPERDX_LOG_LEVEL` + - **Default:** `info` + - **Description:** Log verbosity level for the collector. + - **Guidance:** + - Accepts values like `debug`, `info`, `warn`, `error` + - Use `debug` during troubleshooting + +- `OPAMP_SERVER_URL` + - **Default:** *None (required)* if standalone image. If All-in-one or Docker Compose distribution this points to the deployed HyperDX instance. + - **Description:** URL of the OpAMP server used to manage the collector (e.g., HyperDX instance). This is port `4320` by default. + - **Guidance:** + - Must point to your HyperDX instance + - Enables dynamic configuration and secure ingestion + +- `HYPERDX_OTEL_EXPORTER_CLICKHOUSE_DATABASE` + - **Default:** `default` + - **Description:** ClickHouse database the collector writes telemetry data to. + - **Guidance:** + - Set if using a custom database name + - Ensure the specified user has access to this database - `OTEL_AGENT_FEATURE_GATE_ARG` - **Default:** `` diff --git a/docs/use-cases/observability/clickstack/deployment/_snippets/_json_support.md b/docs/use-cases/observability/clickstack/deployment/_snippets/_json_support.md index 9b3052f0d65..e4fada85177 100644 --- a/docs/use-cases/observability/clickstack/deployment/_snippets/_json_support.md +++ b/docs/use-cases/observability/clickstack/deployment/_snippets/_json_support.md @@ -12,4 +12,3 @@ In order to enable support for the JSON type users must set the following enviro - `OTEL_AGENT_FEATURE_GATE_ARG='--feature-gates=clickhouse.json'` - enables support in the OTel collector, ensuring schemas are created using the JSON type. - `BETA_CH_OTEL_JSON_SCHEMA_ENABLED=true` - enables support in the HyperDX application, allowing JSON data to be queried. - diff --git a/docs/use-cases/observability/clickstack/deployment/all-in-one.md b/docs/use-cases/observability/clickstack/deployment/all-in-one.md index 2aeab9f4c7a..db3317a6996 100644 --- a/docs/use-cases/observability/clickstack/deployment/all-in-one.md +++ b/docs/use-cases/observability/clickstack/deployment/all-in-one.md @@ -30,31 +30,19 @@ This option includes authentication, enabling the persistence of dashboards, ale
    - ### Deploy with Docker {#deploy-with-docker} - The following will run an OpenTelemetry collector (on port 4317 and 4318) and the HyperDX UI (on port 8080). - ```shell docker run -p 8080:8080 -p 4317:4317 -p 4318:4318 docker.hyperdx.io/hyperdx/hyperdx-all-in-one ``` - ### Navigate to the HyperDX UI {#navigate-to-hyperdx-ui} - Visit [http://localhost:8080](http://localhost:8080) to access the HyperDX UI. - -Create a user, providing a username and password which meets the requirements. - +Create a user, providing a username and password which meets the requirements. On clicking `Create` data sources will be created for the integrated ClickHouse instance. - HyperDX UI - For an example of using an alternative ClickHouse instance, see ["Create a ClickHouse Cloud connection"](/use-cases/observability/clickstack/getting-started#create-a-cloud-connection). - ### Ingest data {#ingest-data} - To ingest data see ["Ingesting data"](/use-cases/observability/clickstack/ingesting-data). - ## Persisting data and settings {#persisting-data-and-settings} @@ -94,7 +82,7 @@ docker run -p 8080:8080 -p 4317:4317 -p 4999:4318 docker.hyperdx.io/hyperdx/hype ## Using ClickHouse Cloud {#using-clickhouse-cloud} -This distribution can be used with ClickHouse Cloud. While the local ClickHouse instance will still be deployed (and ignored), the OTel collector can be configured to use a ClickHouse Cloud instance by setting the environment variables `CLICKHOUSE_ENDPOINT`, `CLICKHOUSE_USER` and `CLICKHOUSE_PASSWORD`. +This distribution can be used with ClickHouse Cloud. While the local ClickHouse instance will still be deployed (and ignored), the OTel collector can be configured to use a ClickHouse Cloud instance by setting the environment variables `CLICKHOUSE_ENDPOINT`, `CLICKHOUSE_USER` and `CLICKHOUSE_PASSWORD`. For example: @@ -120,4 +108,4 @@ For example: ```shell docker run -e OTEL_AGENT_FEATURE_GATE_ARG='--feature-gates=clickhouse.json' -e BETA_CH_OTEL_JSON_SCHEMA_ENABLED=true -p 8080:8080 -p 4317:4317 -p 4318:4318 docker.hyperdx.io/hyperdx/hyperdx-all-in-one -``` \ No newline at end of file +``` diff --git a/docs/use-cases/observability/clickstack/deployment/docker-compose.md b/docs/use-cases/observability/clickstack/deployment/docker-compose.md index fdf0d8e6037..fba2033a505 100644 --- a/docs/use-cases/observability/clickstack/deployment/docker-compose.md +++ b/docs/use-cases/observability/clickstack/deployment/docker-compose.md @@ -7,7 +7,6 @@ sidebar_position: 2 description: 'Deploying ClickStack with Docker Compose - The ClickHouse Observability Stack' --- - import Image from '@theme/IdealImage'; import hyperdx_login from '@site/static/images/use-cases/observability/hyperdx-login.png'; import hyperdx_logs from '@site/static/images/use-cases/observability/hyperdx-logs.png'; @@ -59,9 +58,7 @@ docker compose up ### Navigate to the HyperDX UI {#navigate-to-hyperdx-ui} Visit [http://localhost:8080](http://localhost:8080) to access the HyperDX UI. - -Create a user, providing a username and password which meets the requirements. - +Create a user, providing a username and password which meets the requirements. On clicking `Create` data sources will be created for the ClickHouse instance deployed with the Helm chart. 
:::note Overriding default connection @@ -74,10 +71,8 @@ For an example of using an alternative ClickHouse instance, see ["Create a Click ### Complete connection details {#complete-connection-details} -To connect to the deployed ClickHouse instance, simply click **Create** and accept the default settings. - +To connect to the deployed ClickHouse instance, simply click **Create** and accept the default settings. If you prefer to connect to your own **external ClickHouse cluster** e.g. ClickHouse Cloud, you can manually enter your connection credentials. - If prompted to create a source, retain all default values and complete the `Table` field with the value `otel_logs`. All other settings should be auto-detected, allowing you to click `Save New Source`. Create logs source @@ -153,28 +148,27 @@ This distribution can be used with ClickHouse Cloud. Users should: - On connecting to the HyperDX UI and creating a connection to ClickHouse, use your Cloud credentials. - - -To set these, modify the relevant services in the `docker-compose.yaml`: - - -```yaml - app: - image: ${HDX_IMAGE_REPO}/${IMAGE_NAME_DOCKERHUB}:${IMAGE_VERSION} - ports: - - ${HYPERDX_API_PORT}:${HYPERDX_API_PORT} - - ${HYPERDX_APP_PORT}:${HYPERDX_APP_PORT} - environment: - BETA_CH_OTEL_JSON_SCHEMA_ENABLED: true # enable JSON - FRONTEND_URL: ${HYPERDX_APP_URL}:${HYPERDX_APP_PORT} - HYPERDX_API_KEY: ${HYPERDX_API_KEY} - HYPERDX_API_PORT: ${HYPERDX_API_PORT} - # truncated for brevity - - otel-collector: - image: ${HDX_IMAGE_REPO}/${OTEL_COLLECTOR_IMAGE_NAME_DOCKERHUB}:${IMAGE_VERSION} - environment: - OTEL_AGENT_FEATURE_GATE_ARG: '--feature-gates=clickhouse.json' # enable JSON - CLICKHOUSE_ENDPOINT: 'tcp://ch-server:9000?dial_timeout=10s' + + + To set these, modify the relevant services in the `docker-compose.yaml`: + + ```yaml + app: + image: ${HDX_IMAGE_REPO}/${IMAGE_NAME_DOCKERHUB}:${IMAGE_VERSION} + ports: + - ${HYPERDX_API_PORT}:${HYPERDX_API_PORT} + - ${HYPERDX_APP_PORT}:${HYPERDX_APP_PORT} + environment: + BETA_CH_OTEL_JSON_SCHEMA_ENABLED: true # enable JSON + FRONTEND_URL: ${HYPERDX_APP_URL}:${HYPERDX_APP_PORT} + HYPERDX_API_KEY: ${HYPERDX_API_KEY} + HYPERDX_API_PORT: ${HYPERDX_API_PORT} # truncated for brevity -``` + + otel-collector: + image: ${HDX_IMAGE_REPO}/${OTEL_COLLECTOR_IMAGE_NAME_DOCKERHUB}:${IMAGE_VERSION} + environment: + OTEL_AGENT_FEATURE_GATE_ARG: '--feature-gates=clickhouse.json' # enable JSON + CLICKHOUSE_ENDPOINT: 'tcp://ch-server:9000?dial_timeout=10s' + # truncated for brevity + ``` diff --git a/docs/use-cases/observability/clickstack/deployment/helm.md b/docs/use-cases/observability/clickstack/deployment/helm.md index 3d0d76d019d..5a336228324 100644 --- a/docs/use-cases/observability/clickstack/deployment/helm.md +++ b/docs/use-cases/observability/clickstack/deployment/helm.md @@ -39,157 +39,110 @@ The chart supports standard Kubernetes best practices, including:
    
-
### Prerequisites {#prerequisites}
-
- [Helm](https://helm.sh/) v3+
- Kubernetes cluster (v1.20+ recommended)
- `kubectl` configured to interact with your cluster
-
### Add the HyperDX Helm repository {#add-the-hyperdx-helm-repository}
-
Add the HyperDX Helm repository:
-
```shell
helm repo add hyperdx https://hyperdxio.github.io/helm-charts
helm repo update
```
-
### Installing HyperDX {#installing-hyperdx}
-
To install the HyperDX chart with default values:
-
```shell
helm install my-hyperdx hyperdx/hdx-oss-v2
```
-
### Verify the installation {#verify-the-installation}
-
Verify the installation:
-
```shell
kubectl get pods -l "app.kubernetes.io/name=hdx-oss-v2"
```
-
When all pods are ready, proceed.
-
### Forward ports {#forward-ports}
-
Port forwarding allows us to access and set up HyperDX. Users deploying to production should instead expose the service via an ingress or load balancer to ensure proper network access, TLS termination, and scalability. Port forwarding is best suited for local development or one-off administrative tasks, not long-term or high-availability environments.
-
```shell
kubectl port-forward \
-    pod/$(kubectl get pod -l app.kubernetes.io/name=hdx-oss-v2 -o jsonpath='{.items[0].metadata.name}') \
-    8080:3000
+pod/$(kubectl get pod -l app.kubernetes.io/name=hdx-oss-v2 -o jsonpath='{.items[0].metadata.name}') \
+8080:3000
```
-
### Navigate to the UI {#navigate-to-the-ui}
-
Visit [http://localhost:8080](http://localhost:8080) to access the HyperDX UI.
-
-Create a user, providing a username and password which means the requirements.
-
+Create a user, providing a username and password which meets the requirements.
HyperDX UI
-
-
On clicking `Create`, data sources will be created for the ClickHouse instance deployed with the Helm chart.
-
:::note Overriding default connection
You can override the default connection to the integrated ClickHouse instance. For details, see ["Using ClickHouse Cloud"](#using-clickhouse-cloud).
:::
-
For an example of using an alternative ClickHouse instance, see ["Create a ClickHouse Cloud connection"](/use-cases/observability/clickstack/getting-started#create-a-cloud-connection).
-
### Customizing values (optional) {#customizing-values}
-
You can customize settings by using `--set` flags. For example:
-
```shell
helm install my-hyperdx hyperdx/hdx-oss-v2 --set key=value
```
-
Alternatively, edit the `values.yaml`. To retrieve the default values:
-
```shell
helm show values hyperdx/hdx-oss-v2 > values.yaml
```
-
Example config:
-
```yaml
replicaCount: 2
resources:
-  limits:
-    cpu: 500m
-    memory: 512Mi
-  requests:
-    cpu: 250m
-    memory: 256Mi
+limits:
+cpu: 500m
+memory: 512Mi
+requests:
+cpu: 250m
+memory: 256Mi
ingress:
-  enabled: true
-  annotations:
-    kubernetes.io/ingress.class: nginx
-  hosts:
-    - host: hyperdx.example.com
-      paths:
-        - path: /
-          pathType: ImplementationSpecific
+enabled: true
+annotations:
+kubernetes.io/ingress.class: nginx
+hosts:
+- host: hyperdx.example.com
+paths:
+- path: /
+pathType: ImplementationSpecific
```
-
```shell
helm install my-hyperdx hyperdx/hdx-oss-v2 -f values.yaml
```
-
### Using secrets (optional) {#using-secrets}
-
For handling sensitive data such as API keys or database credentials, use Kubernetes secrets. The HyperDX Helm charts provide default secret files that you can modify and apply to your cluster. 
- #### Using pre-configured secrets {#using-pre-configured-secrets} - The Helm chart includes a default secret template located at [`charts/hdx-oss-v2/templates/secrets.yaml`](https://github.com/hyperdxio/helm-charts/blob/main/charts/hdx-oss-v2/templates/secrets.yaml). This file provides a base structure for managing secrets. - - If you need to manually apply a secret, modify and apply the provided `secrets.yaml` template: - ```yaml apiVersion: v1 kind: Secret metadata: - name: hyperdx-secret - annotations: - "helm.sh/resource-policy": keep +name: hyperdx-secret +annotations: +"helm.sh/resource-policy": keep type: Opaque data: - API_KEY: +API_KEY: ``` - Apply the secret to your cluster: - ```shell kubectl apply -f secrets.yaml ``` - #### Creating a custom secret {#creating-a-custom-secret} - If you prefer, you can create a custom Kubernetes secret manually: - ```shell kubectl create secret generic hyperdx-secret \ - --from-literal=API_KEY=my-secret-api-key +--from-literal=API_KEY=my-secret-api-key ``` - #### Referencing a secret {#referencing-a-secret} - To reference a secret in `values.yaml`: - ```yaml hyperdx: - apiKey: - valueFrom: - secretKeyRef: - name: hyperdx-secret - key: API_KEY +apiKey: +valueFrom: +secretKeyRef: +name: hyperdx-secret +key: API_KEY ``` - ## Using ClickHouse Cloud {#using-clickhouse-cloud} @@ -240,7 +193,6 @@ helm install my-hyperdx hyperdx/hdx-oss-v2 -f values.yaml # helm upgrade my-hyperdx hyperdx/hdx-oss-v2 -f values.yaml ``` - ## Production notes {#production-notes} By default, this chart also installs ClickHouse and the OTel collector. However, for production, it is recommended that you manage ClickHouse and the OTel collector separately. @@ -309,7 +261,6 @@ kubectl get pods -l app.kubernetes.io/name=hdx-oss-v2 Users can set these environment variables via either parameters or the `values.yaml` e.g. - *values.yaml* ```yaml @@ -317,14 +268,14 @@ hyperdx: ... env: - name: BETA_CH_OTEL_JSON_SCHEMA_ENABLED - value: "true" + value: "true" otel: ... env: - name: OTEL_AGENT_FEATURE_GATE_ARG - value: "--feature-gates=clickhouse.json" -``` + value: "--feature-gates=clickhouse.json" + ``` or via `--set`: @@ -333,4 +284,4 @@ helm install myrelease hyperdx-helm --set "hyperdx.env[0].name=BETA_CH_OTEL_JSON --set "hyperdx.env[0].value=true" \ --set "otel.env[0].name=OTEL_AGENT_FEATURE_GATE_ARG" \ --set "otel.env[0].value=--feature-gates=clickhouse.json" -``` \ No newline at end of file +``` diff --git a/docs/use-cases/observability/clickstack/deployment/hyperdx-only.md b/docs/use-cases/observability/clickstack/deployment/hyperdx-only.md index aaf2b71cd35..633daa965f2 100644 --- a/docs/use-cases/observability/clickstack/deployment/hyperdx-only.md +++ b/docs/use-cases/observability/clickstack/deployment/hyperdx-only.md @@ -30,37 +30,23 @@ In this mode, data ingestion is left entirely to the user. You can ingest data i
    - ### Deploy with Docker {#deploy-hyperdx-with-docker} - -Run the following command, modifying `YOUR_MONGODB_URI` as required. - +Run the following command, modifying `YOUR_MONGODB_URI` as required. ```shell docker run -e MONGO_URI=mongodb://YOUR_MONGODB_URI -p 8080:8080 docker.hyperdx.io/hyperdx/hyperdx ``` - ### Navigate to the HyperDX UI {#navigate-to-hyperdx-ui} - Visit [http://localhost:8080](http://localhost:8080) to access the HyperDX UI. - -Create a user, providing a username and password which meets the requirements. - +Create a user, providing a username and password which meets the requirements. On clicking `Create` you'll be prompted for connection details. - HyperDX UI - ### Complete connection details {#complete-connection-details} - Connect to your own external ClickHouse cluster e.g. ClickHouse Cloud. - HyperDX Login - If prompted to create a source, retain all default values and complete the `Table` field with the value `otel_logs`. All other settings should be auto-detected, allowing you to click `Save New Source`. - :::note Creating a source Creating a source requires tables to exist in ClickHouse. If you don't have data, we recommend deploying the ClickStack OpenTelemetry collector to create tables. ::: - ## Using Docker Compose {#using-docker-compose} diff --git a/docs/use-cases/observability/clickstack/deployment/index.md b/docs/use-cases/observability/clickstack/deployment/index.md index 915f069c696..0b8c4a26866 100644 --- a/docs/use-cases/observability/clickstack/deployment/index.md +++ b/docs/use-cases/observability/clickstack/deployment/index.md @@ -16,4 +16,4 @@ Each of the deployment options are summarized below. The [Getting Started Guide] | Helm | Official Helm chart for Kubernetes-based deployments. Supports ClickHouse Cloud and production scaling. | Production deployments on Kubernetes | Kubernetes knowledge required, customization via Helm | [Helm](/use-cases/observability/clickstack/deployment/helm) | | Docker Compose | Deploy each ClickStack component individually via Docker Compose. | Local testing, proof of concepts, production on single server, BYO ClickHouse | No fault tolerance, requires managing multiple containers | [Docker Compose](/use-cases/observability/clickstack/deployment/docker-compose) | | HyperDX Only | Use HyperDX independently with your own ClickHouse and schema. | Existing ClickHouse users, custom event pipelines | No ClickHouse included, user must manage ingestion and schema | [HyperDX Only](/use-cases/observability/clickstack/deployment/hyperdx-only) | -| Local Mode Only | Runs entirely in the browser with local storage. No backend or persistence. | Demos, debugging, dev with HyperDX | No auth, no persistence, no alerting, single-user only | [Local Mode Only](/use-cases/observability/clickstack/deployment/local-mode-only) | \ No newline at end of file +| Local Mode Only | Runs entirely in the browser with local storage. No backend or persistence. 
| Demos, debugging, dev with HyperDX | No auth, no persistence, no alerting, single-user only | [Local Mode Only](/use-cases/observability/clickstack/deployment/local-mode-only) | diff --git a/docs/use-cases/observability/clickstack/deployment/local-mode-only.md b/docs/use-cases/observability/clickstack/deployment/local-mode-only.md index 368e9be2a94..e5f1d035702 100644 --- a/docs/use-cases/observability/clickstack/deployment/local-mode-only.md +++ b/docs/use-cases/observability/clickstack/deployment/local-mode-only.md @@ -19,7 +19,7 @@ Similar to the [all-in-one image](/use-cases/observability/clickstack/deployment * **OpenTelemetry (OTel) collector** (exposing OTLP on ports `4317` and `4318`) * **MongoDB** (for persistent application state) -**However, user authentication is disabled for this distribution of HyperDX** + **However, user authentication is disabled for this distribution of HyperDX** ### Suitable for {#suitable-for} @@ -31,32 +31,20 @@ Similar to the [all-in-one image](/use-cases/observability/clickstack/deployment
    - ### Deploy with Docker {#deploy-with-docker} - Local mode deploys the HyperDX UI only, accessible on port 8080. - ```shell docker run -p 8080:8080 docker.hyperdx.io/hyperdx/hyperdx-local ``` - ### Navigate to the HyperDX UI {#navigate-to-hyperdx-ui} - Visit [http://localhost:8080](http://localhost:8080) to access the HyperDX UI. - **You will not be prompted to create a user, as authentication is not enabled in this deployment mode.** - Connect to your own external ClickHouse cluster e.g. ClickHouse Cloud. - Create login - Create a source, retain all default values, and complete the `Table` field with the value `otel_logs`. All other settings should be auto-detected, allowing you to click `Save New Source`. - Create logs source - - For the local mode only image, users only need to set the `BETA_CH_OTEL_JSON_SCHEMA_ENABLED=true` parameter e.g. diff --git a/docs/use-cases/observability/clickstack/example-datasets/local-data.md b/docs/use-cases/observability/clickstack/example-datasets/local-data.md index 0f6e3444035..8f79db8ac05 100644 --- a/docs/use-cases/observability/clickstack/example-datasets/local-data.md +++ b/docs/use-cases/observability/clickstack/example-datasets/local-data.md @@ -24,145 +24,109 @@ This getting started guide allows you collect local logs and metrics from your s The following example assumes you have started ClickStack using the [instructions for the all-in-one image](/use-cases/observability/clickstack/getting-started) and connected to the [local ClickHouse instance](/use-cases/observability/clickstack/getting-started#complete-connection-credentials) or a [ClickHouse Cloud instance](/use-cases/observability/clickstack/getting-started#create-a-cloud-connection). - ## Navigate to the HyperDX UI {#navigate-to-the-hyperdx-ui} - Visit [http://localhost:8080](http://localhost:8080) to access the HyperDX UI. - ## Copy ingestion API key {#copy-ingestion-api-key} - Navigate to [`Team Settings`](http://localhost:8080/team) and copy the `Ingestion API Key` from the `API Keys` section. This API key ensures data ingestion through the OpenTelemetry collector is secure. - Copy API key - ## Create a local OpenTelemetry configuration {#create-otel-configuration} - Create a `otel-file-collector.yaml` file with the following content. - **Important**: Populate the value `` with your ingestion API key copied above. 
- ```yaml receivers: - filelog: - include: - - /var/log/**/*.log # Linux - - /var/log/syslog - - /var/log/messages - - /private/var/log/*.log # macOS - start_at: beginning # modify to collect new files only - - hostmetrics: - collection_interval: 1s - scrapers: - cpu: - metrics: - system.cpu.time: - enabled: true - system.cpu.utilization: - enabled: true - memory: - metrics: - system.memory.usage: - enabled: true - system.memory.utilization: - enabled: true - filesystem: - metrics: - system.filesystem.usage: - enabled: true - system.filesystem.utilization: - enabled: true - paging: - metrics: - system.paging.usage: - enabled: true - system.paging.utilization: - enabled: true - system.paging.faults: - enabled: true - disk: - load: - network: - processes: - +filelog: +include: +- /var/log/**/*.log # Linux +- /var/log/syslog +- /var/log/messages +- /private/var/log/*.log # macOS +start_at: beginning # modify to collect new files only +hostmetrics: +collection_interval: 1s +scrapers: +cpu: +metrics: +system.cpu.time: +enabled: true +system.cpu.utilization: +enabled: true +memory: +metrics: +system.memory.usage: +enabled: true +system.memory.utilization: +enabled: true +filesystem: +metrics: +system.filesystem.usage: +enabled: true +system.filesystem.utilization: +enabled: true +paging: +metrics: +system.paging.usage: +enabled: true +system.paging.utilization: +enabled: true +system.paging.faults: +enabled: true +disk: +load: +network: +processes: exporters: - otlp: - endpoint: localhost:4317 - headers: - authorization: - tls: - insecure: true - sending_queue: - enabled: true - num_consumers: 10 - queue_size: 262144 # 262,144 items × ~8 KB per item ≈ 2 GB - +otlp: +endpoint: localhost:4317 +headers: +authorization: +tls: +insecure: true +sending_queue: +enabled: true +num_consumers: 10 +queue_size: 262144 # 262,144 items × ~8 KB per item ≈ 2 GB service: - pipelines: - logs: - receivers: [filelog] - exporters: [otlp] - metrics: - receivers: [hostmetrics] - exporters: [otlp] +pipelines: +logs: +receivers: [filelog] +exporters: [otlp] +metrics: +receivers: [hostmetrics] +exporters: [otlp] ``` - This configuration collects system logs and metric for OSX and Linux systems, sending the results to ClickStack via the OTLP endpoint on port 4317. - :::note Ingestion timestamps This configuration adjusts timestamps at ingest, assigning an updated time value to each event. Users should ideally [preprocess or parse timestamps](/use-cases/observability/clickstack/ingesting-data/otel-collector#processing-filtering-transforming-enriching) using OTel processors or operators in their log files to ensure accurate event time is retained. - With this example setup, if the receiver or file processor is configured to start at the beginning of the file, all existing log entries will be assigned the same adjusted timestamp - the time of processing rather than the original event time. Any new events appended to the file will receive timestamps approximating their actual generation time. - To avoid this behavior, you can set the start position to `end` in the receiver configuration. This ensures only new entries are ingested and timestamped near their true arrival time. ::: - For more details on the OpenTelemetry (OTel) configuration structure, we recommend [the official guide](https://opentelemetry.io/docs/collector/configuration/). - ## Start the collector {#start-the-collector} - Run the following docker command to start an instance of the OTel collector. 
-
```shell
docker run --network=host --rm -it \
-    --user 0:0 \
-    -v "$(pwd)/otel-file-collector.yaml":/etc/otel/config.yaml \
-    -v /var/log:/var/log:ro \
-    -v /private/var/log:/private/var/log:ro \
-    otel/opentelemetry-collector-contrib:latest \
-    --config /etc/otel/config.yaml
+--user 0:0 \
+-v "$(pwd)/otel-file-collector.yaml":/etc/otel/config.yaml \
+-v /var/log:/var/log:ro \
+-v /private/var/log:/private/var/log:ro \
+otel/opentelemetry-collector-contrib:latest \
+--config /etc/otel/config.yaml
```
-
:::note Root user
We run the collector as the root user to access all system logs—this is necessary to capture logs from protected paths on Linux-based systems. However, this approach is not recommended for production. In production environments, the OpenTelemetry Collector should be deployed as a local agent with only the minimal permissions required to access the intended log sources.
:::
-
The collector will immediately begin collecting local system logs and metrics.
-
## Explore system logs {#explore-system-logs}
-
Navigate to the HyperDX UI. The search UI should be populated with local system logs. Expand the filters to select the `system.log`:
-
HyperDX Local logs
-
## Explore system metrics {#explore-system-metrics}
-
We can explore our metrics using charts.
-
-Navigate to the Chart Explorer via the left menu. Select the source `Metrics` and `Maximum` as the aggregation type.
-
+Navigate to the Chart Explorer via the left menu. Select the source `Metrics` and `Maximum` as the aggregation type.
For the `Select a Metric` menu simply type `memory` before selecting `system.memory.utilization (Gauge)`.
-
Press the run button to visualize your memory utilization over time.
-
Memory over time
-
-Note the number is returned as a floating point `%`. To render it more clearly, select `Set number format`.
-
+Note the number is returned as a floating point `%`. To render it more clearly, select `Set number format`.
Number format
-
From the subsequent menu you can select `Percentage` from the `Output format` drop down before clicking `Apply`.
-
Memory % of time
-
diff --git a/docs/use-cases/observability/clickstack/example-datasets/remote-demo-data.md b/docs/use-cases/observability/clickstack/example-datasets/remote-demo-data.md
index 71610af04e6..97eb4b81213 100644
--- a/docs/use-cases/observability/clickstack/example-datasets/remote-demo-data.md
+++ b/docs/use-cases/observability/clickstack/example-datasets/remote-demo-data.md
@@ -38,7 +38,6 @@ import architecture from '@site/static/images/use-cases/observability/hyperdx-de
import demo_sources from '@site/static/images/use-cases/observability/hyperdx-demo//demo_sources.png';
import edit_connection from '@site/static/images/use-cases/observability/edit_connection.png';
-
**The following guide assumes you have deployed ClickStack using the [instructions for the all-in-one image](/use-cases/observability/clickstack/getting-started), or [Local Mode Only](/use-cases/observability/clickstack/deployment/local-mode-only) and completed initial user creation. Alternatively, users can skip all local setup and simply connect to our ClickStack hosted demo [play-clickstack.clickhouse.com](https://play-clickstack.clickhouse.com) which uses this dataset.**

This guide uses a sample dataset hosted on the public ClickHouse playground at [sql.clickhouse.com](https://sql.clickhouse.com), which you can connect to from your local ClickStack deployment.
@@ -65,250 +64,142 @@ This demo reuses the official OpenTelemetry demo. 
This is composed of microservices. _Credit: https://opentelemetry.io/docs/demo/architecture/_

-Further details on the demo can be found in the [official OpenTelemetry documentation](https://opentelemetry.io/docs/demo/). 
+Further details on the demo can be found in the [official OpenTelemetry documentation](https://opentelemetry.io/docs/demo/).

## Demo steps {#demo-steps}

**We have instrumented this demo with [ClickStack SDKs](/use-cases/observability/clickstack/sdks), deploying the services in Kubernetes, from which metrics and logs have also been collected.**
-
### Connect to the demo server {#connect-to-the-demo-server}
-
:::note Local-Only mode
This step can be skipped if you clicked `Connect to Demo Server` when deploying in Local Mode. If using this mode, sources will be prefixed with `Demo_` e.g. `Demo_Logs`
:::
-
Navigate to `Team Settings` and click `Edit` for the `Local Connection`:
-
Edit Connection
-
Rename the connection to `Demo` and complete the subsequent form with the following connection details for the demo server:
-
- `Connection Name`: `Demo`
- `Host`: `https://sql-clickhouse.clickhouse.com`
- `Username`: `otel_demo`
- `Password`: Leave empty
-
Edit Demo Connection
-
### Modify the sources {#modify-sources}
-
:::note Local-Only mode
This step can be skipped if you clicked `Connect to Demo Server` when deploying in Local Mode. If using this mode, sources will be prefixed with `Demo_` e.g. `Demo_Logs`
:::
-
-Scroll up to `Sources` and modify each of the sources - `Logs`, `Traces`, `Metrics`, and `Sessions` - to use the `otel_v2` database.
-
+Scroll up to `Sources` and modify each of the sources - `Logs`, `Traces`, `Metrics`, and `Sessions` - to use the `otel_v2` database.
Edit Demo Source
-
:::note
You may need to reload the page to ensure the full list of databases is listed in each source.
:::
-
### Adjust the time frame {#adjust-the-timeframe}
-
Adjust the time to show all data from the previous `1 day` using the time picker in the top right.
-
Step 2
-
You may notice a small difference in the number of errors in the overview bar chart, with a small increase in red in several consecutive bars.
-
:::note
The location of the bars will differ depending on when you query the dataset.
:::
-
### Filter to errors {#filter-to-errors}
-
To highlight occurrences of errors, use the `SeverityText` filter and select `error` to display only error-level entries.
-
The error should be more apparent:
-
Step 3
-
### Identify the error patterns {#identify-error-patterns}
-
With HyperDX's Clustering feature, you can automatically identify errors and group them into meaningful patterns. This accelerates user analysis when dealing with large volumes of logs and traces. To use it, select `Event Patterns` from the `Analysis Mode` menu on the left panel.
-
The error clusters reveal issues related to failed payments, including a named pattern `Failed to place order`. Additional clusters also indicate problems charging cards and caches being full.
-
Step 4
-
Note that these error clusters likely originate from different services.
-
### Explore an error pattern {#explore-error-pattern}
-
Click the most obvious error cluster, which correlates with our reported issue of users being unable to complete payments: `Failed to place order`.
-
This will display a list of all occurrences of this error which are associated with the `frontend` service:
-
Step 5
-
Select any of the resulting errors. The logs metadata will be shown in detail. 
Scrolling through both the `Overview` and `Column Values` suggests an issue with charging cards due to a cache:
-
`failed to charge card: could not charge the card: rpc error: code = Unknown desc = Visa cache full: cannot add new item.`
-
Step 6
-
### Explore the infrastructure {#explore-the-infrastructure}
-
We've identified a cache-related error that's likely causing payment failures. We still need to identify where this issue is originating from in our microservice architecture.
-
Given the cache issue, it makes sense to investigate the underlying infrastructure - potentially we have a memory problem in the associated pods? In ClickStack, logs and metrics are unified and displayed in context, making it easier to uncover the root cause quickly.
-
Select the `Infrastructure` tab to view the metrics associated with the underlying pods for the `frontend` service and widen the timespan to `1d`:
-
Step 7
-
The issue does not seem to be infrastructure related - no metrics have appreciably changed over the time period: either before or after the error. Close the infrastructure tab.
-
### Explore a trace {#explore-a-trace}
-
In ClickStack, traces are also automatically correlated with both logs and metrics. Let's explore the trace linked to our selected log to identify the service responsible.
-
Select `Trace` to visualize the associated trace. Scrolling down through the subsequent view we can see how HyperDX is able to visualize the distributed trace across the microservices, connecting the spans in each service. A payment clearly involves multiple microservices, including those that perform checkout and currency conversions.
-
Step 8
-
-By scrolling to the bottom of the view we can see that the `payment` service is causing the error, which in turn propagates back up the call chain.
-
+By scrolling to the bottom of the view we can see that the `payment` service is causing the error, which in turn propagates back up the call chain.
Step 9
-
-### Searching traces {#searching-traces}
-
+### Searching traces {#searching-traces}
We have established that users are failing to complete purchases due to a cache issue in the payment service. Let's explore the traces for this service in more detail to see if we can learn more about the root cause.
-
Switch to the main Search view by selecting `Search`. Switch the data source to `Traces` and select the `Results table` view. **Ensure the timespan is still over the last day.**
-
Step 10
-
This view shows all traces in the last day. We know the issue originates in our payment service, so apply the `payment` filter to the `ServiceName`.
-
Step 11
-
If we apply event clustering to the traces by selecting `Event Patterns`, we can immediately see our cache issue with the `payment` service.
-
Step 12
-
### Explore infrastructure for a trace {#explore-infrastructure-for-a-trace}
-
-Switch to the results view by clicking on `Results table`. Filter to errors using the `StatusCode` filter and `Error` value.
-
+Switch to the results view by clicking on `Results table`. Filter to errors using the `StatusCode` filter and `Error` value.
Step 13
-
Select an `Error: Visa cache full: cannot add new item.` error, switch to the `Infrastructure` tab and widen the timespan to `1d`.
-
Step 14
-
By correlating traces with metrics we can see that memory and CPU increased with the `payment` service, before collapsing to `0` (we can attribute this to a pod restart) - suggesting the cache issue caused resource issues. We can expect this has impacted payment completion times. 
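The same filters can also be expressed directly in SQL. A rough equivalent (a sketch only, assuming the demo's `otel_traces` table in the `otel_v2` database):

```sql
-- Illustrative only: recent failing spans for the payment service,
-- including the card type recorded in the span attributes.
SELECT
    Timestamp,
    SpanName,
    Duration,
    SpanAttributes['app.payment.card_type'] AS card_type
FROM otel_v2.otel_traces
WHERE ServiceName = 'payment'
  AND StatusCode = 'Error'
ORDER BY Timestamp DESC
LIMIT 10
```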
- -### Event deltas for faster resolution {#event-deltas-for-faster-resolution} - -Event Deltas help surface anomalies by attributing changes in performance or error rates to specific subsets of data—making it easier to quickly pinpoint the root cause. - +### Event deltas for faster resolution {#event-deltas-for-faster-resolution} +Event Deltas help surface anomalies by attributing changes in performance or error rates to specific subsets of data—making it easier to quickly pinpoint the root cause. While we know that the `payment` service has a cache issue, causing an increase in resource consumption, we haven't fully identified the root cause. - Return to the result table view and select the time period containing the errors to limit the data. Ensure you select several hours to the left of the errors and after if possible (the issue may still be occurring): - Step 15 - Remove the errors filter and select `Event Deltas` from the left `Analysis Mode` menu. - Step 16 - The top panel shows the distribution of timings, with colors indicating event density (number of spans). The subset of events outside of the main concentration are typically those worth investigating. - If we select the events with a duration greater than `200ms`, and apply the filter `Filter by selection`, we can limit our analysis to slower events: - Step 17 - With analysis performed on the subset of data, we can see most performance spikes are associated with `visa` transactions. - ### Using charts for more context {#using-charts-for-more-context} - -In ClickStack, we can chart any numeric value from logs, traces, or metrics for greater context. - +In ClickStack, we can chart any numeric value from logs, traces, or metrics for greater context. We have established: - - Our issue resides with the payment service - A cache is full - This caused increases in resource consumption - The issue prevented visa payments from completing - or at least causing them to take a long time to complete. - -
    - Select `Chart Explorer` from the left menu. Complete the following values to chart the time taken for payments to complete (a rough SQL equivalent is sketched after the list): - - `Data Source`: `Traces` - `Metric`: `Maximum` - `SQL Column`: `Duration` - `Where`: `ServiceName: payment` - `Timespan`: `Last 1 day` - -
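For reference, the chart configured above corresponds roughly to the following query. This is a sketch only - it assumes the default OpenTelemetry traces schema in the `otel_v2` database, with `Duration` stored in nanoseconds:

```sql
-- Maximum payment span duration per hour over the last day.
SELECT
    toStartOfInterval(Timestamp, INTERVAL 1 HOUR) AS bucket,
    max(Duration) / 1e9 AS max_duration_seconds
FROM otel_v2.otel_traces
WHERE ServiceName = 'payment'
  AND Timestamp > now() - INTERVAL 1 DAY
GROUP BY bucket
ORDER BY bucket;
```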
    - -Clicking `▶️` will show how the performance of payments degraded over time. - +Clicking `▶️` will show how the performance of payments degraded over time. Step 18 - If we set `Group By` to `SpanAttributes['app.payment.card_type']` (just type `card` for autocomplete) we can see how the performance of the service degraded for Visa transactions relative to Mastercard: - Step 19 - Note that once the error occurs, responses return in `0s`. - ### Exploring metrics for more context {#exploring-metrics-for-more-context} - Finally, let's plot the cache size as a metric to see how it behaved over time, thus giving us more context. - Complete the following values: - - `Data Source`: `Metrics` - `Metric`: `Maximum` - `SQL Column`: `visa_validation_cache.size (gauge)` (just type `cache` for autocomplete) - `Where`: `ServiceName: payment` - `Group By`: `` - We can see how the cache size increased over a 4-5 hr period (likely after a software deployment) before reaching a maximum size of `100,000`. From the `Sample Matched Events` we can see our errors correlate with the cache reaching this limit, after which it is recorded as having a size of `0` with responses also returning in `0s`. - Step 20 - In summary, by exploring logs, traces and finally metrics we have concluded: - - Our issue resides with the payment service - A change in service behavior, likely due to a deployment, resulted in a slow increase of a visa cache over a 4-5 hr period - reaching a maximum size of `100,000`. - This caused increases in resource consumption as the cache grew in size - likely due to a poor implementation - As the cache grew, the performance of Visa payments degraded - On reaching the maximum size, the cache rejected payments and reported itself as size `0`. - -### Using sessions {#using-sessions} - +### Using sessions {#using-sessions} Sessions allow us to replay the user experience, offering a visual account of how an error occurred from the user's perspective. While not typically used to diagnose root causes, they are valuable for confirming issues reported to customer support and can serve as a starting point for deeper investigation. - In HyperDX, sessions are linked to traces and logs, providing a complete view of the underlying cause. - For example, if the support team provides the email of a user who encountered a payment issue `Braulio.Roberts23@hotmail.com` - it's often more effective to begin with their session rather than directly searching logs or traces. - Navigate to the `Client Sessions` tab from the left menu, then ensure the data source is set to `Sessions` and the time period is set to the `Last 1 day`: - Step 21 - Search for `SpanAttributes.userEmail: Braulio` to find our customer's session. Selecting the session will show the browser events and associated spans for the customer's session on the left, with the user's browser experience re-rendered to the right: - Step 22 - -### Replaying sessions {#replaying-sessions} - -Sessions can be replayed by pressing the ▶️ button. Switching between `Highlighted` and `All Events` allows varying degrees of span granularity, with the former highlighting key events and errors. - +### Replaying sessions {#replaying-sessions} +Sessions can be replayed by pressing the ▶️ button. Switching between `Highlighted` and `All Events` allows varying degrees of span granularity, with the former highlighting key events and errors. If we scroll to the bottom of the spans we can see a `500` error associated with `/api/checkout`.
Selecting the ▶️ button for this specific span moves the replay to this point in the session, allowing us to confirm the customer's experience - the payment simply does not work, with no error rendered. - Step 23 - Selecting the span, we can confirm this was caused by an internal error. By clicking the `Trace` tab and scrolling through the connected spans, we are able to confirm the customer was indeed a victim of our cache issue. - Step 24 -
    This demo walks through a real-world incident involving failed payments in an e-commerce app, showing how ClickStack helps uncover root causes through unified logs, traces, metrics, and session replays - explore our [other getting started guides](/use-cases/observability/clickstack/sample-datasets) to dive deeper into specific features. diff --git a/docs/use-cases/observability/clickstack/example-datasets/sample-data.md b/docs/use-cases/observability/clickstack/example-datasets/sample-data.md index 99828480459..e92c40474ce 100644 --- a/docs/use-cases/observability/clickstack/example-datasets/sample-data.md +++ b/docs/use-cases/observability/clickstack/example-datasets/sample-data.md @@ -34,132 +34,78 @@ import copy_api_key from '@site/static/images/use-cases/observability/copy_api_k The following example assumes you have started ClickStack using the [instructions for the all-in-one image](/use-cases/observability/clickstack/getting-started) and connected to the [local ClickHouse instance](/use-cases/observability/clickstack/getting-started#complete-connection-credentials) or a [ClickHouse Cloud instance](/use-cases/observability/clickstack/getting-started#create-a-cloud-connection). - ## Navigate to the HyperDX UI {#navigate-to-the-hyperdx-ui} - Visit [http://localhost:8080](http://localhost:8080) to access the HyperDX UI. - HyperDX UI - ## Copy ingestion API key {#copy-ingestion-api-key} - Navigate to [`Team Settings`](http://localhost:8080/team) and copy the `Ingestion API Key` from the `API Keys` section. This API key ensures data ingestion through the OpenTelemetry collector is secure. - Copy API key - ## Download sample data {#download-sample-data} - In order to populate the UI with sample data, download the following file: - [Sample data](https://storage.googleapis.com/hyperdx/sample.tar.gz) - ```shell # curl curl -O https://storage.googleapis.com/hyperdx/sample.tar.gz # or # wget https://storage.googleapis.com/hyperdx/sample.tar.gz ``` - This file contains example logs, metrics, and traces from our public [OpenTelemetry demo](https://github.com/ClickHouse/opentelemetry-demo) - a simple e-commerce store with microservices. Copy this file to a directory of your choosing. - ## Load sample data {#load-sample-data} - -To load this data, we simply send it to the HTTP endpoint of the deployed OpenTelemetry (OTel) collector. - +To load this data, we simply send it to the HTTP endpoint of the deployed OpenTelemetry (OTel) collector. First, export the API key copied above. - ```shell # export API key export CLICKSTACK_API_KEY= ``` - Run the following command to send the data to the OTel collector: - ```shell for filename in $(tar -tf sample.tar.gz); do - endpoint="http://localhost:4318/v1/${filename%.json}" - echo "loading ${filename%.json}" - tar -xOf sample.tar.gz "$filename" | while read -r line; do - echo "$line" | curl -s -o /dev/null -X POST "$endpoint" \ - -H "Content-Type: application/json" \ - -H "authorization: ${CLICKSTACK_API_KEY}" \ - --data-binary @- - done +endpoint="http://localhost:4318/v1/${filename%.json}" +echo "loading ${filename%.json}" +tar -xOf sample.tar.gz "$filename" | while read -r line; do +echo "$line" | curl -s -o /dev/null -X POST "$endpoint" \ +-H "Content-Type: application/json" \ +-H "authorization: ${CLICKSTACK_API_KEY}" \ +--data-binary @- +done done ``` - This simulates OLTP log, trace, and metric sources sending data to the OTel collector. In production, these sources may be language clients or even other OTel collectors. 
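While the load runs, you can optionally verify that rows are arriving by querying the bundled ClickHouse instance directly (for example, with `clickhouse client` inside the all-in-one container). This is a sketch that assumes the default ClickStack table names in the `default` database; your schema may differ:

```sql
-- Optional sanity check: confirm the sample logs and traces are being inserted.
SELECT 'logs' AS source, count() AS rows FROM otel_logs
UNION ALL
SELECT 'traces' AS source, count() AS rows FROM otel_traces;
```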
- Returning to the `Search` view, you should see that data has started to load: - HyperDX search - Data loading will take a few minutes. Allow for the load to be completed before progressing to the next steps. - ## Explore sessions {#explore-sessions} - -Suppose we have reports that our users are experiencing issues paying for goods. We can view their experience using HyperDX's session replay capabilities. - +Suppose we have reports that our users are experiencing issues paying for goods. We can view their experience using HyperDX's session replay capabilities. Select [`Client Sessions`](http://localhost:8080/sessions?from=1747312320000&to=1747312920000&sessionSource=l1324572572) from the left menu. - Sessions - This view allows us to see front-end sessions for our e-commerce store. Sessions remain Anonymous until users check out and try to complete a purchase. - Note that some sessions with emails have an associated error, potentially confirming reports of failed transactions. - Select a trace with a failure and associated email. The subsequent view allows us to replay the user's session and review their issue. Press play to watch the session. - Session replay - The replay shows the user navigating the site, adding items to their cart. Feel free to skip to later in the session where they attempt to complete a payment. - :::tip -Any errors are annotated on the timeline in red. +Any errors are annotated on the timeline in red. ::: - -The user was unable to place the order, with no obvious error. Scroll to the bottom of the left panel, containing the network and console events from the user's browser. You will notice a 500 error was thrown on making a `/api/checkout` call. - +The user was unable to place the order, with no obvious error. Scroll to the bottom of the left panel, containing the network and console events from the user's browser. You will notice a 500 error was thrown on making a `/api/checkout` call. Error in session - Select this `500` error. Neither the `Overview` nor `Column Values` indicate the source of the issue, other than the fact the error is unexpected, causing an `Internal Error`. - ## Explore traces {#explore-traces} - -Navigate to the `Trace` tab to see the full distributed trace. - +Navigate to the `Trace` tab to see the full distributed trace. Session trace - -Scroll down the trace to see the origin of the error - the `checkout` service span. Select the `Payment` service span. - +Scroll down the trace to see the origin of the error - the `checkout` service span. Select the `Payment` service span. Span - Select the tab `Column Values` and scroll down. We can see the issue is associated with a cache being full. - Column values - Scrolling up and returning to the trace, we can see logs are correlated with the span, thanks to our earlier configuration. These provide further context. - Correlated log - -We've established that a cache is getting filled in the payment service, which is preventing payments from completing. - +We've established that a cache is getting filled in the payment service, which is preventing payments from completing. ## Explore logs {#explore-logs} - For further details, we can return to the [`Search` view](http://localhost:8080/search): - Select `Logs` from the sources and apply a filter to the `payment` service. - Logs - We can see that while the issue is recent, the number of impacted payments is high. Furthermore, a cache related to the visa payments appears to be causing issues. 
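To quantify what the UI is showing, a similar check can be run directly against the logs table. A minimal sketch, assuming the default `otel_logs` schema (the severity value may differ depending on the instrumentation):

```sql
-- Recent error-level log volume for the payment service, in 5 minute buckets.
SELECT
    toStartOfInterval(Timestamp, INTERVAL 5 MINUTE) AS bucket,
    count() AS error_logs
FROM otel_logs
WHERE ServiceName = 'payment'
  AND lower(SeverityText) = 'error'
GROUP BY bucket
ORDER BY bucket DESC
LIMIT 12;
```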
- ## Chart metrics {#chart-metrics} - While an error has clearly been introduced in the code, we can use metrics to confirm the cache size. Navigate to the `Chart Explorer` view. - Select `Metrics` as the data source. Complete the chart builder to plot the `Maximum` of `visa_validation_cache.size (Gauge)` and press the play button. The cache was clearly increasing before reaching a maximum size, after which errors were generated. - Metrics - diff --git a/docs/use-cases/observability/clickstack/getting-started.md b/docs/use-cases/observability/clickstack/getting-started.md index fd5c09b4f96..2209f7f73cf 100644 --- a/docs/use-cases/observability/clickstack/getting-started.md +++ b/docs/use-cases/observability/clickstack/getting-started.md @@ -21,7 +21,6 @@ import delete_connection from '@site/static/images/use-cases/observability/delet import created_sources from '@site/static/images/use-cases/observability/created_sources.png'; import edit_connection from '@site/static/images/use-cases/observability/edit_connection.png'; - Getting started with **ClickStack** is straightforward thanks to the availability of prebuilt Docker images. These images are based on the official ClickHouse Debian package and are available in multiple distributions to suit different use cases. ## Local deployment {#local-deployment} @@ -32,60 +31,41 @@ The simplest option is a **single-image distribution** that includes all core co - **OpenTelemetry (OTel) collector** - **ClickHouse** -This all-in-one image allows you to launch the full stack with a single command, making it ideal for testing, experimentation, or quick local deployments. + This all-in-one image allows you to launch the full stack with a single command, making it ideal for testing, experimentation, or quick local deployments. - ### Deploy stack with docker {#deploy-stack-with-docker} - The following will run an OpenTelemetry collector (on port 4317 and 4318) and the HyperDX UI (on port 8080). - ```shell docker run -p 8080:8080 -p 4317:4317 -p 4318:4318 docker.hyperdx.io/hyperdx/hyperdx-all-in-one ``` - :::note Persisting data and settings -To persist data and settings across restarts of the container, users can modify the above docker command to mount the paths `/data/db`, `/var/lib/clickhouse` and `/var/log/clickhouse-server`. - +To persist data and settings across restarts of the container, users can modify the above docker command to mount the paths `/data/db`, `/var/lib/clickhouse` and `/var/log/clickhouse-server`. For example: - ```shell # modify command to mount paths docker run \ - -p 8080:8080 \ - -p 4317:4317 \ - -p 4318:4318 \ - -v "$(pwd)/.volumes/db:/data/db" \ - -v "$(pwd)/.volumes/ch_data:/var/lib/clickhouse" \ - -v "$(pwd)/.volumes/ch_logs:/var/log/clickhouse-server" \ - docker.hyperdx.io/hyperdx/hyperdx-all-in-one +-p 8080:8080 \ +-p 4317:4317 \ +-p 4318:4318 \ +-v "$(pwd)/.volumes/db:/data/db" \ +-v "$(pwd)/.volumes/ch_data:/var/lib/clickhouse" \ +-v "$(pwd)/.volumes/ch_logs:/var/log/clickhouse-server" \ +docker.hyperdx.io/hyperdx/hyperdx-all-in-one ``` ::: - ### Navigate to the HyperDX UI {#navigate-to-hyperdx-ui} - Visit [http://localhost:8080](http://localhost:8080) to access the HyperDX UI. - -Create a user, providing a username and password that meets the complexity requirements. - +Create a user, providing a username and password that meets the complexity requirements. 
HyperDX UI - HyperDX will automatically connect to the local cluster and create data sources for the logs, traces, metrics, and sessions - allowing you to explore the product immediately. - ### Explore the product {#explore-the-product} - With the stack deployed, try one of our sample datasets. - To continue using the local cluster: - - [Example dataset](/use-cases/observability/clickstack/getting-started/sample-data) - Load an example dataset from our public demo. Diagnose a simple issue. - [Local files and metrics](/use-cases/observability/clickstack/getting-started/local-data) - Load local files and monitor the system on OSX or Linux using a local OTel collector. - -
    -Alternatively, you can connect to a demo cluster where you can explore a larger dataset: - +Alternatively, you can connect to a demo cluster where you can explore a larger dataset: - [Remote demo dataset](/use-cases/observability/clickstack/getting-started/remote-demo-data) - Explore a demo dataset in our demo ClickHouse service. -
    ## Deploy with ClickHouse Cloud {#deploy-with-clickhouse-cloud} @@ -93,73 +73,47 @@ Alternatively, you can connect to a demo cluster where you can explore a larger Users can deploy ClickStack against ClickHouse Cloud, benefiting from a fully managed, secure backend while retaining complete control over ingestion, schema, and observability workflows. - ### Create a ClickHouse Cloud service {#create-a-service} - Follow the [getting started guide for ClickHouse Cloud](/getting-started/quick-start/cloud#1-create-a-clickhouse-service) to create a service. - ### Copy connection details {#copy-cloud-connection-details} - -To find the connection details for HyperDX, navigate to the ClickHouse Cloud console and click the Connect button on the sidebar. - +To find the connection details for HyperDX, navigate to the ClickHouse Cloud console and click the Connect button on the sidebar. Copy the HTTP connection details, specifically the HTTPS endpoint (`endpoint`) and password. - Connect Cloud - :::note Deploying to production While we will use the `default` user to connect HyperDX, we recommend creating a dedicated user when [going to production](/use-cases/observability/clickstack/production#create-a-user). ::: - ### Deploy with docker {#deploy-with-docker} - Open a terminal and export the credentials copied above: - ```shell export CLICKHOUSE_USER=default export CLICKHOUSE_ENDPOINT= export CLICKHOUSE_PASSWORD= ``` - Run the following docker command: - ```shell docker run -e CLICKHOUSE_ENDPOINT=${CLICKHOUSE_ENDPOINT} -e CLICKHOUSE_USER=default -e CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD} -p 8080:8080 -p 4317:4317 -p 4318:4318 docker.hyperdx.io/hyperdx/hyperdx-all-in-one ``` - This will expose an OpenTelemetry collector (on port 4317 and 4318), and the HyperDX UI (on port 8080). - ### Navigate to the HyperDX UI {#navigate-to-hyperdx-ui-cloud} - Visit [http://localhost:8080](http://localhost:8080) to access the HyperDX UI. - -Create a user, providing a username and password which meets the complexity requirements. - +Create a user, providing a username and password which meets the complexity requirements. HyperDX Login - ### Create a ClickHouse Cloud connection {#create-a-cloud-connection} - Navigate to `Team Settings` and click `Edit` for the `Local Connection`: - Edit Connection - Rename the connection to `Cloud` and complete the subsequent form with your ClickHouse Cloud service credentials before clicking `Save`: - Create Cloud connection - ### Explore the product {#explore-the-product-cloud} - With the stack deployed, try one of our same datasets. - - [Example dataset](/use-cases/observability/clickstack/getting-started/sample-data) - Load an example dataset from our public demo. Diagnose a simple issue. - [Local files and metrics](/use-cases/observability/clickstack/getting-started/local-data) - Load local files and monitor the system on OSX or Linux using a local OTel collector. - ## Local mode {#local-mode} -Local mode is a way to deploy HyperDX without needing to authenticate. +Local mode is a way to deploy HyperDX without needing to authenticate. -Authentication is not supported. +Authentication is not supported. This mode is intended to be used for quick testing, development, demos and debugging use cases where authentication and settings persistence is not necessary. 
@@ -170,25 +124,15 @@ You can use a hosted version of HyperDX in local mode available at [play.hyperdx ### Self-hosted version {#self-hosted-version} - ### Run with docker {#run-local-with-docker} - The self-hosted local mode image comes with an OpenTelemetry collector and a ClickHouse server pre-configured as well. This makes it easy to consume telemetry data from your applications and visualize it in HyperDX with minimal external setup. To get started with the self-hosted version, simply run the Docker container with the appropriate ports forwarded: - ```shell docker run -p 8080:8080 docker.hyperdx.io/hyperdx/hyperdx-local ``` - You will not be promoted to create a user as local mode does not include authentication. - ### Complete connection credentials {#complete-connection-credentials} - To connect to your own **external ClickHouse cluster**, you can manually enter your connection credentials. - Alternatively, for a quick exploration of the product, you can also click **Connect to Demo Server** to access preloaded datasets and try ClickStack with no setup required. - Credentials - If connecting to the demo server, users can explore the dataset with the [demo dataset instructions](/use-cases/observability/clickstack/getting-started/remote-demo-data). - diff --git a/docs/use-cases/observability/clickstack/index.md b/docs/use-cases/observability/clickstack/index.md index 0d8edef60d2..300779be624 100644 --- a/docs/use-cases/observability/clickstack/index.md +++ b/docs/use-cases/observability/clickstack/index.md @@ -19,4 +19,3 @@ description: 'Landing page for the ClickHouse Observability Stack' | [Ingesting Data](/use-cases/observability/clickstack/ingesting-data) | Guidelines for ingesting data to ClickStack | | [Search](/use-cases/observability/clickstack/search) | How to search and query your observability data | | [Production](/use-cases/observability/clickstack/production) | Best practices for production deployment | - diff --git a/docs/use-cases/observability/clickstack/ingesting-data/collector.md b/docs/use-cases/observability/clickstack/ingesting-data/collector.md index 4d2c6ae508f..c3d0fd02bad 100644 --- a/docs/use-cases/observability/clickstack/ingesting-data/collector.md +++ b/docs/use-cases/observability/clickstack/ingesting-data/collector.md @@ -21,13 +21,13 @@ This page includes details on configuring the official ClickStack OpenTelemetry OpenTelemetry collectors can be deployed in two principal roles: -- **Agent** - Agent instances collect data at the edge e.g. on servers or on Kubernetes nodes, or receive events directly from applications - instrumented with an OpenTelemetry SDK. In the latter case, the agent instance runs with the application or on the same host as the application (such as a sidecar or a DaemonSet). Agents can either send their data directly to ClickHouse or to a gateway instance. In the former case, this is referred to as [Agent deployment pattern](https://opentelemetry.io/docs/collector/deployment/agent/). +- **Agent** - Agent instances collect data at the edge e.g. on servers or on Kubernetes nodes, or receive events directly from applications - instrumented with an OpenTelemetry SDK. In the latter case, the agent instance runs with the application or on the same host as the application (such as a sidecar or a DaemonSet). Agents can either send their data directly to ClickHouse or to a gateway instance. In the former case, this is referred to as [Agent deployment pattern](https://opentelemetry.io/docs/collector/deployment/agent/). 
-- **Gateway** - Gateway instances provide a standalone service (for example, a deployment in Kubernetes), typically per cluster, per data center, or per region. These receive events from applications (or other collectors as agents) via a single OTLP endpoint. Typically, a set of gateway instances are deployed, with an out-of-the-box load balancer used to distribute the load amongst them. If all agents and applications send their signals to this single endpoint, it is often referred to as a [Gateway deployment pattern](https://opentelemetry.io/docs/collector/deployment/gateway/). +- **Gateway** - Gateway instances provide a standalone service (for example, a deployment in Kubernetes), typically per cluster, per data center, or per region. These receive events from applications (or other collectors as agents) via a single OTLP endpoint. Typically, a set of gateway instances are deployed, with an out-of-the-box load balancer used to distribute the load amongst them. If all agents and applications send their signals to this single endpoint, it is often referred to as a [Gateway deployment pattern](https://opentelemetry.io/docs/collector/deployment/gateway/). -**Important: The collector, including in default distributions of ClickStack, assumes the [gateway role described below](#collector-roles), receiving data from agents or SDKs.** + **Important: The collector, including in default distributions of ClickStack, assumes the [gateway role described below](#collector-roles), receiving data from agents or SDKs.** -Users deploying OTel collectors in the agent role will typically use the [default contrib distribution of the collector](https://github.com/open-telemetry/opentelemetry-collector-contrib) and not the ClickStack version but are free to use other OTLP compatible technologies such as [Fluentd](https://www.fluentd.org/) and [Vector](https://vector.dev/). + Users deploying OTel collectors in the agent role will typically use the [default contrib distribution of the collector](https://github.com/open-telemetry/opentelemetry-collector-contrib) and not the ClickStack version but are free to use other OTLP compatible technologies such as [Fluentd](https://www.fluentd.org/) and [Vector](https://vector.dev/). ## Deploying the collector {#configuring-the-collector} @@ -43,7 +43,7 @@ docker run -e OPAMP_SERVER_URL=${OPAMP_SERVER_URL} -e CLICKHOUSE_ENDPOINT=${CLIC Note that we can overwrite the target ClickHouse instance with environment variables for `CLICKHOUSE_ENDPOINT`, `CLICKHOUSE_USERNAME`, and `CLICKHOUSE_PASSWORD`. The `CLICKHOUSE_ENDPOINT` should be the full ClickHouse HTTP endpoint, including the protocol and port—for example, `http://localhost:8123`. -**These environment variables can be used with any of the docker distributions which include the connector.** +**These environment variables can be used with any of the docker distributions which include the connector.** The `OPAMP_SERVER_URL` should point to your HyperDX deployment - for example, `http://localhost:4320`. HyperDX exposes an OpAMP (Open Agent Management Protocol) server at `/v1/opamp` on port `4320` by default. Make sure to expose this port from the container running HyperDX (e.g., using `-p 4320:4320`). 
@@ -91,10 +91,10 @@ With Docker Compose, modify the collector configuration using the same environme - '4317:4317' # OTLP gRPC receiver - '4318:4318' # OTLP http receiver - '8888:8888' # metrics extension - restart: always - networks: + restart: always + networks: - internal -``` + ``` ### Advanced configuration {#advanced-configuration} @@ -106,7 +106,6 @@ The default ClickStack configuration for the OpenTelemetry (OTel) collector can For details on configuring OTel collectors, including [`receivers`](https://opentelemetry.io/docs/collector/transforming-telemetry/), [`operators`](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/stanza/docs/operators/README.md), and [`processors`](https://opentelemetry.io/docs/collector/configuration/#processors), we recommend the [official OpenTelemetry collector documentation](https://opentelemetry.io/docs/collector/configuration). - ## Securing the collector {#securing-the-collector} The ClickStack distribution of the OpenTelemetry collector includes built-in support for OpAMP (Open Agent Management Protocol), which it uses to securely configure and manage the OTLP endpoint. On startup, users must provide an `OPAMP_SERVER_URL` environment variable — this should point to the HyperDX app, which hosts the OpAMP API at `/v1/opamp`. @@ -119,11 +118,11 @@ To further secure your deployment, we recommend: - Configuring the collector to communicate with ClickHouse over HTTPS. - Create a dedicated user for ingestion with limited permissions - see below. -- Enabling TLS for the OTLP endpoint, ensuring encrypted communication between SDKs/agents and the collector. **Currently, this requires users to deploy a default distribution of the collector and manage the configuration themselves**. +- Enabling TLS for the OTLP endpoint, ensuring encrypted communication between SDKs/agents and the collector. **Currently, this requires users to deploy a default distribution of the collector and manage the configuration themselves**. ### Creating an ingestion user {#creating-an-ingestion-user} -We recommend creating a dedicated database and user for the OTel collector for ingestion into ClickHouse. This should have the ability to create and insert into the [tables created and used by ClickStack](/use-cases/observability/clickstack/ingesting-data/schemas). +We recommend creating a dedicated database and user for the OTel collector for ingestion into ClickHouse. This should have the ability to create and insert into the [tables created and used by ClickStack](/use-cases/observability/clickstack/ingesting-data/schemas). ```sql CREATE DATABASE otel; @@ -140,9 +139,9 @@ Users will invariably want to filter, transform, and enrich event messages durin - Deploy their own version of the OTel collector performing filtering and processing, sending events to the ClickStack collector via OTLP for ingestion into ClickHouse. - Deploy their own version of the OTel collector and send events directly to ClickHouse using the ClickHouse exporter. -If processing is done using the OTel collector, we recommend doing transformations at gateway instances and minimizing any work done at agent instances. This will ensure the resources required by agents at the edge, running on servers, are as minimal as possible. Typically, we see users only performing filtering (to minimize unnecessary network usage), timestamp setting (via operators), and enrichment, which requires context in agents. 
For example, if gateway instances reside in a different Kubernetes cluster, k8s enrichment will need to occur in the agent. + If processing is done using the OTel collector, we recommend doing transformations at gateway instances and minimizing any work done at agent instances. This will ensure the resources required by agents at the edge, running on servers, are as minimal as possible. Typically, we see users only performing filtering (to minimize unnecessary network usage), timestamp setting (via operators), and enrichment, which requires context in agents. For example, if gateway instances reside in a different Kubernetes cluster, k8s enrichment will need to occur in the agent. -OpenTelemetry supports the following processing and filtering features users can exploit: + OpenTelemetry supports the following processing and filtering features users can exploit: - **Processors** - Processors take the data collected by [receivers and modify or transform](https://opentelemetry.io/docs/collector/transforming-telemetry/) it before sending it to the exporters. Processors are applied in the order as configured in the `processors` section of the collector configuration. These are optional, but the minimal set is [typically recommended](https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor#recommended-processors). When using an OTel collector with ClickHouse, we recommend limiting processors to: @@ -154,7 +153,7 @@ OpenTelemetry supports the following processing and filtering features users can - **Operators** - [Operators](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/stanza/docs/operators/README.md) provide the most basic unit of processing available at the receiver. Basic parsing is supported, allowing fields such as the Severity and Timestamp to be set. JSON and regex parsing are supported here along with event filtering and basic transformations. We recommend performing event filtering here. -We recommend users avoid doing excessive event processing using operators or [transform processors](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/transformprocessor/README.md). These can incur considerable memory and CPU overhead, especially JSON parsing. It is possible to do all processing in ClickHouse at insert time with materialized views and columns with some exceptions - specifically, context-aware enrichment e.g. adding of k8s metadata. For more details, see [Extracting structure with SQL](/use-cases/observability/schema-design#extracting-structure-with-sql). + We recommend users avoid doing excessive event processing using operators or [transform processors](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/transformprocessor/README.md). These can incur considerable memory and CPU overhead, especially JSON parsing. It is possible to do all processing in ClickHouse at insert time with materialized views and columns with some exceptions - specifically, context-aware enrichment e.g. adding of k8s metadata. For more details, see [Extracting structure with SQL](/use-cases/observability/schema-design#extracting-structure-with-sql). ### Example {#example-processing} @@ -162,37 +161,36 @@ The following configuration shows collection of this [unstructured log file](htt Note the use of operators to extract structure from the log lines (`regex_parser`) and filter events, along with a processor to batch events and limit memory usage. 
- ```yaml # config-unstructured-logs-with-processor.yaml receivers: filelog: include: - /opt/data/logs/access-unstructured.log - start_at: beginning - operators: + start_at: beginning + operators: - type: regex_parser - regex: '^(?P[\d.]+)\s+-\s+-\s+\[(?P[^\]]+)\]\s+"(?P[A-Z]+)\s+(?P[^\s]+)\s+HTTP/[^\s]+"\s+(?P\d+)\s+(?P\d+)\s+"(?P[^"]*)"\s+"(?P[^"]*)"' - timestamp: + regex: '^(?P[\d.]+)\s+-\s+-\s+\[(?P[^\]]+)\]\s+"(?P[A-Z]+)\s+(?P[^\s]+)\s+HTTP/[^\s]+"\s+(?P\d+)\s+(?P\d+)\s+"(?P[^"]*)"\s+"(?P[^"]*)"' + timestamp: parse_from: attributes.timestamp layout: '%d/%b/%Y:%H:%M:%S %z' #22/Jan/2019:03:56:14 +0330 -processors: - batch: - timeout: 1s - send_batch_size: 100 - memory_limiter: - check_interval: 1s - limit_mib: 2048 - spike_limit_mib: 256 -exporters: + processors: + batch: + timeout: 1s + send_batch_size: 100 + memory_limiter: + check_interval: 1s + limit_mib: 2048 + spike_limit_mib: 256 + exporters: # HTTP setup otlphttp/hdx: endpoint: 'http://localhost:4318' headers: authorization: compression: gzip - + # gRPC setup (alternative) otlp/hdx: endpoint: 'localhost:4317' @@ -227,9 +225,9 @@ By default, inserts into ClickHouse are synchronous and idempotent if identical. - (1) If the node receiving the data has issues, the insert query will time out (or get a more specific error) and not receive an acknowledgment. - (2) If the data got written by the node, but the acknowledgement can't be returned to the sender of the query because of network interruptions, the sender will either get a timeout or a network error. -From the collector's perspective, (1) and (2) can be hard to distinguish. However, in both cases, the unacknowledged insert can just be retried immediately. As long as the retried insert query contains the same data in the same order, ClickHouse will automatically ignore the retried insert if the original (unacknowledged) insert succeeded. + From the collector's perspective, (1) and (2) can be hard to distinguish. However, in both cases, the unacknowledged insert can just be retried immediately. As long as the retried insert query contains the same data in the same order, ClickHouse will automatically ignore the retried insert if the original (unacknowledged) insert succeeded. -For this reason, the ClickStack distribution of the OTel collector uses the [batch processor](https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/batchprocessor/README.md). This ensures inserts are sent as consistent batches of rows satisfying the above requirements. If a collector is expected to have high throughput (events per second), and at least 5000 events can be sent in each insert, this is usually the only batching required in the pipeline. In this case the collector will flush batches before the batch processor's `timeout` is reached, ensuring the end-to-end latency of the pipeline remains low and batches are of a consistent size. + For this reason, the ClickStack distribution of the OTel collector uses the [batch processor](https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/batchprocessor/README.md). This ensures inserts are sent as consistent batches of rows satisfying the above requirements. If a collector is expected to have high throughput (events per second), and at least 5000 events can be sent in each insert, this is usually the only batching required in the pipeline. 
In this case the collector will flush batches before the batch processor's `timeout` is reached, ensuring the end-to-end latency of the pipeline remains low and batches are of a consistent size. ### Use asynchronous inserts {#use-asynchronous-inserts} @@ -259,7 +257,7 @@ The ClickStack OTel collector acts a Gateway instance - see [Collector roles](#c Scaling with gateways -The objective of this architecture is to offload computationally intensive processing from the agents, thereby minimizing their resource usage. These ClickStack gateways can perform transformation tasks that would otherwise need to be done by agents. Furthermore, by aggregating events from many agents, the gateways can ensure large batches are sent to ClickHouse - allowing efficient insertion. These gateway collectors can easily be scaled as more agents and SDK sources are added and event throughput increases. +The objective of this architecture is to offload computationally intensive processing from the agents, thereby minimizing their resource usage. These ClickStack gateways can perform transformation tasks that would otherwise need to be done by agents. Furthermore, by aggregating events from many agents, the gateways can ensure large batches are sent to ClickHouse - allowing efficient insertion. These gateway collectors can easily be scaled as more agents and SDK sources are added and event throughput increases. ### Adding Kafka {#adding-kafka} @@ -306,7 +304,7 @@ The JSON type offers the following benefits to ClickStack users: - **Type preservation** - Numbers stay numbers, booleans stay booleans—no more flattening everything into strings. This means fewer casts, simpler queries, and more accurate aggregations. - **Path-level columns** - Each JSON path becomes its own sub-column, reducing I/O. Queries only read the fields they need, unlocking major performance gains over the old Map type which required the entire column to be read in order to query a specific field. - **Deep nesting just works** - Naturally handle complex, deeply nested structures without manual flattening (as required by the Map type) and subsequent awkward JSONExtract functions. -- **Dynamic, evolving schemas** - Perfect for observability data where teams add new tags and attributes over time. JSON handles these changes automatically, without schema migrations. +- **Dynamic, evolving schemas** - Perfect for observability data where teams add new tags and attributes over time. JSON handles these changes automatically, without schema migrations. - **Faster queries, lower memory** - Typical aggregations over attributes like `LogAttributes` see 5-10x less data read and dramatic speedups, cutting both query time and peak memory usage. - **Simple management** - No need to pre-materialize columns for performance. Each field becomes its own sub-column, delivering the same speed as native ClickHouse columns. @@ -330,39 +328,25 @@ docker run -e OTEL_AGENT_FEATURE_GATE_ARG='--feature-gates=clickhouse.json' -e O The [JSON type](/interfaces/formats/JSON) type is not backwards compatible with existing map-based schemas. New tables will be created using the `JSON` type. ::: - To migrate from the Map-based schemas, follow these steps: - - #### Stop the OTel collector {#stop-the-collector} - #### Rename existing tables and update sources {#rename-existing-tables-sources} - -Rename existing tables and update data sources in HyperDX. - +Rename existing tables and update data sources in HyperDX. 
For example: - ```sql RENAME TABLE otel_logs TO otel_logs_map; RENAME TABLE otel_metrics TO otel_metrics_map; ``` - #### Deploy the collector {#deploy-the-collector} - Deploy the collector with `OTEL_AGENT_FEATURE_GATE_ARG` set. - #### Restart the HyperDX container with JSON schema support {#restart-the-hyperdx-container} - ```shell export BETA_CH_OTEL_JSON_SCHEMA_ENABLED=true ``` - #### Create new data sources {#create-new-data-sources} - Create new data sources in HyperDX pointing to the JSON tables. - #### Migrating existing data (optional) {#migrating-existing-data} diff --git a/docs/use-cases/observability/clickstack/ingesting-data/kubernetes.md b/docs/use-cases/observability/clickstack/ingesting-data/kubernetes.md index caacdba2858..23730a55549 100644 --- a/docs/use-cases/observability/clickstack/ingesting-data/kubernetes.md +++ b/docs/use-cases/observability/clickstack/ingesting-data/kubernetes.md @@ -13,9 +13,9 @@ This guide integrates the following: - **Logs** - **Infra Metrics** -:::note -To send over application-level metrics or APM/traces, you'll need to add the corresponding language integration to your application as well. -::: + :::note + To send over application-level metrics or APM/traces, you'll need to add the corresponding language integration to your application as well. + ::: ## Creating the OTel Helm chart configuration files {#creating-the-otel-helm-chart-config-files} @@ -37,9 +37,9 @@ clusterRole: rules: - apiGroups: - '' - resources: + resources: - nodes/proxy - verbs: + verbs: - get presets: @@ -108,10 +108,10 @@ config: logs: exporters: - otlphttp - metrics: - exporters: + metrics: + exporters: - otlphttp -``` + ``` ### Creating the deployment configuration {#creating-the-deployment-configuration} @@ -159,10 +159,10 @@ config: logs: exporters: - otlphttp - metrics: - exporters: + metrics: + exporters: - otlphttp -``` + ``` ## Deploying the OpenTelemetry collector {#deploying-the-otel-collector} @@ -216,31 +216,31 @@ spec: spec: containers: - name: app-container - image: my-image - env: + image: my-image + env: # ... 
other environment variables # Collect K8s metadata from the downward API to forward to the app - name: POD_NAME - valueFrom: + valueFrom: fieldRef: fieldPath: metadata.name - name: POD_UID - valueFrom: + valueFrom: fieldRef: fieldPath: metadata.uid - name: POD_NAMESPACE - valueFrom: + valueFrom: fieldRef: fieldPath: metadata.namespace - name: NODE_NAME - valueFrom: + valueFrom: fieldRef: fieldPath: spec.nodeName - name: DEPLOYMENT_NAME - valueFrom: + valueFrom: fieldRef: fieldPath: metadata.labels['deployment'] # Forward the K8s metadata to the app via OTEL_RESOURCE_ATTRIBUTES - name: OTEL_RESOURCE_ATTRIBUTES - value: k8s.pod.name=$(POD_NAME),k8s.pod.uid=$(POD_UID),k8s.namespace.name=$(POD_NAMESPACE),k8s.node.name=$(NODE_NAME),k8s.deployment.name=$(DEPLOYMENT_NAME) -``` + value: k8s.pod.name=$(POD_NAME),k8s.pod.uid=$(POD_UID),k8s.namespace.name=$(POD_NAMESPACE),k8s.node.name=$(NODE_NAME),k8s.deployment.name=$(DEPLOYMENT_NAME) + ``` diff --git a/docs/use-cases/observability/clickstack/ingesting-data/opentelemetry.md b/docs/use-cases/observability/clickstack/ingesting-data/opentelemetry.md index 13127876c26..94ef2cca5bf 100644 --- a/docs/use-cases/observability/clickstack/ingesting-data/opentelemetry.md +++ b/docs/use-cases/observability/clickstack/ingesting-data/opentelemetry.md @@ -30,11 +30,11 @@ If you're using the [HyperDX-only](/use-cases/observability/clickstack/deploymen - Running your own OpenTelemetry collector and pointing it at ClickHouse - see below. - Sending directly to ClickHouse using alternative tooling, such as [Vector](https://vector.dev/), [Fluentd](https://www.fluentd.org/) etc, or even the default [OTel contrib collector distribution](https://github.com/open-telemetry/opentelemetry-collector-contrib). -:::note We recommend using the ClickStack OpenTelemetry collector -This allows users to benefit from standardized ingestion, enforced schemas, and out-of-the-box compatibility with the HyperDX UI. Using the default schema enables automatic source detection and preconfigured column mappings. -::: + :::note We recommend using the ClickStack OpenTelemetry collector + This allows users to benefit from standardized ingestion, enforced schemas, and out-of-the-box compatibility with the HyperDX UI. Using the default schema enables automatic source detection and preconfigured column mappings. + ::: -For further details see ["Deploying the collector"](/use-cases/observability/clickstack/ingesting-data/otel-collector). + For further details see ["Deploying the collector"](/use-cases/observability/clickstack/ingesting-data/otel-collector). ## Sending OpenTelemetry data {#sending-otel-data} @@ -43,25 +43,23 @@ To send data to ClickStack, point your OpenTelemetry instrumentation to the foll - **HTTP (OTLP):** `http://localhost:4318` - **gRPC (OTLP):** `localhost:4317` -For most [language SDKs](/use-cases/observability/clickstack/sdks) and telemetry libraries that support OpenTelemetry, users can simply set `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable in your application: + For most [language SDKs](/use-cases/observability/clickstack/sdks) and telemetry libraries that support OpenTelemetry, users can simply set `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable in your application: -```shell -export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 -``` - -In addition, an authorization header containing the API ingestion key is required. You can find the key in the HyperDX app under `Team Settings → API Keys`. 
+ ```shell + export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 + ``` -Ingestion keys + In addition, an authorization header containing the API ingestion key is required. You can find the key in the HyperDX app under `Team Settings → API Keys`. + Ingestion keys -For language SDKs, this can either be set by an `init` function or via an`OTEL_EXPORTER_OTLP_HEADERS` environment variable e.g.: + For language SDKs, this can either be set by an `init` function or via an`OTEL_EXPORTER_OTLP_HEADERS` environment variable e.g.: -```shell -OTEL_EXPORTER_OTLP_HEADERS='authorization=' -``` - -Agents should likewise include this authorization header in any OTLP communication. For example, if deploying a [contrib distribution of the OTel collector](https://github.com/open-telemetry/opentelemetry-collector-contrib) in the agent role, they can use the OTLP exporter. An example agent config consuming this [structured log file](https://datasets-documentation.s3.eu-west-3.amazonaws.com/http_logs/access-structured.log.gz), is shown below. Note the need to specify an authorization key - see ``. + ```shell + OTEL_EXPORTER_OTLP_HEADERS='authorization=' + ``` + Agents should likewise include this authorization header in any OTLP communication. For example, if deploying a [contrib distribution of the OTel collector](https://github.com/open-telemetry/opentelemetry-collector-contrib) in the agent role, they can use the OTLP exporter. An example agent config consuming this [structured log file](https://datasets-documentation.s3.eu-west-3.amazonaws.com/http_logs/access-structured.log.gz), is shown below. Note the need to specify an authorization key - see ``. ```yaml # clickhouse-agent-config.yaml @@ -69,20 +67,20 @@ receivers: filelog: include: - /opt/data/logs/access-structured.log - start_at: beginning - operators: + start_at: beginning + operators: - type: json_parser - timestamp: + timestamp: parse_from: attributes.time_local layout: '%Y-%m-%d %H:%M:%S' -exporters: + exporters: # HTTP setup otlphttp/hdx: endpoint: 'http://localhost:4318' headers: authorization: compression: gzip - + # gRPC setup (alternative) otlp/hdx: endpoint: 'localhost:4317' diff --git a/docs/use-cases/observability/clickstack/ingesting-data/overview.md b/docs/use-cases/observability/clickstack/ingesting-data/overview.md index e3b7089c5a2..c7191a57d31 100644 --- a/docs/use-cases/observability/clickstack/ingesting-data/overview.md +++ b/docs/use-cases/observability/clickstack/ingesting-data/overview.md @@ -20,16 +20,14 @@ This collector exposes two OTLP endpoints: - **HTTP** - port `4318` - **gRPC** - port `4317` -Users can send data to these endpoints either directly from [language SDKs](/use-cases/observability/clickstack/sdks) or OTel-compatible data collection agents e.g. other OTel collectors collecting infrastructure metrics and logs. + Users can send data to these endpoints either directly from [language SDKs](/use-cases/observability/clickstack/sdks) or OTel-compatible data collection agents e.g. other OTel collectors collecting infrastructure metrics and logs. -More specifically: + More specifically: -- [**Language SDKs**](/use-cases/observability/clickstack/sdks) are responsible for collecting telemetry from within your application - most notably **traces** and **logs** - and exporting this data to the OpenTelemetry collector, via the OTLP endpoint, which handles ingestion into ClickHouse. For more details on the language SDKs available with ClickStack see [SDKs](/use-cases/observability/clickstack/sdks). 
+- [**Language SDKs**](/use-cases/observability/clickstack/sdks) are responsible for collecting telemetry from within your application - most notably **traces** and **logs** - and exporting this data to the OpenTelemetry collector, via the OTLP endpoint, which handles ingestion into ClickHouse. For more details on the language SDKs available with ClickStack see [SDKs](/use-cases/observability/clickstack/sdks). - **Data collection agents** are agents deployed at the edge — on servers, Kubernetes nodes, or alongside applications. They collect infrastructure telemetry (e.g. logs, metrics) or receive events directly from applications instrumented with SDKs. In this case, the agent runs on the same host as the application, often as a sidecar or DaemonSet. These agents forward data to the central ClickStack OTel collector, which acts as a [gateway](/use-cases/observability/clickstack/ingesting-data/otel-collector#collector-roles), typically deployed once per cluster, data center, or region. The [gateway](/use-cases/observability/clickstack/ingesting-data/otel-collector#collector-roles) receives OTLP events from agents or applications and handles ingestion into ClickHouse. See [OTel collector](/use-cases/observability/clickstack/ingesting-data/otel-collector) for more details. These agents can be other instances of the OTel collector or alternative technologies such as [Fluentd](https://www.fluentd.org/) or [Vector](https://vector.dev/). -:::note OpenTelemetry compatibility -While ClickStack offers its own language SDKs and a custom OpenTelemetry, with enhanced telemetry and features, users can also use their existing OpenTelemetry SDKs and agents seamlessly. -::: - - + :::note OpenTelemetry compatibility + While ClickStack offers its own language SDKs and a custom OpenTelemetry, with enhanced telemetry and features, users can also use their existing OpenTelemetry SDKs and agents seamlessly. + ::: diff --git a/docs/use-cases/observability/clickstack/ingesting-data/sdks/aws-lambda.md b/docs/use-cases/observability/clickstack/ingesting-data/sdks/aws-lambda.md index da361e27905..ad0db78d8b4 100644 --- a/docs/use-cases/observability/clickstack/ingesting-data/sdks/aws-lambda.md +++ b/docs/use-cases/observability/clickstack/ingesting-data/sdks/aws-lambda.md @@ -10,7 +10,6 @@ title: 'AWS Lambda' import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; - **This guide Integrates:** @@ -32,7 +31,7 @@ The OpenTelemetry project provides separate lambda layers to: ### Adding the language-specific auto-instrumentation layer {#adding-language-specific-auto-instrumentation} -The language-specific auto-instrumentation lambda layers automatically instrument your Lambda function code with OpenTelemetry auto-instrumentation package for your specific language. +The language-specific auto-instrumentation lambda layers automatically instrument your Lambda function code with OpenTelemetry auto-instrumentation package for your specific language. Each language and region has its own layer ARN. 
@@ -45,92 +44,69 @@ If your Lambda is already instrumented with an OpenTelemetry SDK, you can skip t - ```shell arn:aws:lambda::184161586896:layer:opentelemetry-nodejs-0_7_0:1 ``` - - ```shell copy arn:aws:lambda::184161586896:layer:opentelemetry-python-0_7_0:1 ``` - - - ```shell copy arn:aws:lambda::184161586896:layer:opentelemetry-javaagent-0_6_0:1 ``` - - - ```shell copy arn:aws:lambda::184161586896:layer:opentelemetry-ruby-0_1_0:1 ``` - - -_The latest releases of the layers can be found in the [OpenTelemetry Lambda Layers GitHub repository](https://github.com/open-telemetry/opentelemetry-lambda/releases)._ + _The latest releases of the layers can be found in the [OpenTelemetry Lambda Layers GitHub repository](https://github.com/open-telemetry/opentelemetry-lambda/releases)._ 3. Configure the following environment variables in your Lambda function under "Configuration" > "Environment variables". - ```shell OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 AWS_LAMBDA_EXEC_WRAPPER=/opt/otel-handler OTEL_PROPAGATORS=tracecontext OTEL_TRACES_SAMPLER=always_on ``` - - ```shell OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 AWS_LAMBDA_EXEC_WRAPPER=/opt/otel-instrument OTEL_PROPAGATORS=tracecontext OTEL_TRACES_SAMPLER=always_on ``` - - - ```shell OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 AWS_LAMBDA_EXEC_WRAPPER=/opt/otel-handler OTEL_PROPAGATORS=tracecontext OTEL_TRACES_SAMPLER=always_on ``` - - - ```shell OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 AWS_LAMBDA_EXEC_WRAPPER=/opt/otel-handler OTEL_PROPAGATORS=tracecontext OTEL_TRACES_SAMPLER=always_on ``` - - - ### Installing the OpenTelemetry collector Lambda layer {#installing-the-otel-collector-layer} -The collector Lambda layer allows you to forward logs, metrics, and traces from your Lambda function to ClickStack without impacting response times due +The collector Lambda layer allows you to forward logs, metrics, and traces from your Lambda function to ClickStack without impacting response times due to exporter latency. **To install the collector layer**: @@ -138,28 +114,19 @@ to exporter latency. 1. In the Layers section click "Add a layer" 2. Select specify an ARN and choose the correct ARN based on architecture, ensure you replace the `` with your region (ex. `us-east-2`): - - - ```shell arn:aws:lambda::184161586896:layer:opentelemetry-collector-amd64-0_8_0:1 ``` - - - ```shell arn:aws:lambda::184161586896:layer:opentelemetry-collector-arm64-0_8_0:1 ``` - - - 3. Add the following `collector.yaml` file to your project to configure the collector to send to ClickStack: ```yaml @@ -201,16 +168,16 @@ service: 4. Add the following environment variable: -```shell -OPENTELEMETRY_COLLECTOR_CONFIG_FILE=/var/task/collector.yaml -``` + ```shell + OPENTELEMETRY_COLLECTOR_CONFIG_FILE=/var/task/collector.yaml + ``` ## Checking the installation {#checking-the-installation} After deploying the layers, you should now see traces automatically -collected from your Lambda function in HyperDX. The `decouple` and `batching` -processor may introduce a delay in telemetry collection, so traces may be -delayed in showing up. To emit custom logs or metrics, you'll need to instrument your code your language-specific +collected from your Lambda function in HyperDX. The `decouple` and `batching` +processor may introduce a delay in telemetry collection, so traces may be +delayed in showing up. To emit custom logs or metrics, you'll need to instrument your code your language-specific OpenTelemetry SDKs. 
## Troubleshooting {#troubleshoting} @@ -219,7 +186,7 @@ OpenTelemetry SDKs. If you're not seeing your manually defined traces or other telemetry, you may be using an incompatible version of the OpenTelemetry API package. Ensure your -OpenTelemetry API package is at least the same or lower version than the +OpenTelemetry API package is at least the same or lower version than the version included in the AWS lambda. ### Enabling SDK debug logs {#enabling-sdk-debug-logs} @@ -231,7 +198,7 @@ is correctly instrumenting your application. ### Enabling collector debug logs {#enabling-collector-debug-logs} To debug collector issues, you can enable debug logs by modifying your collector -configuration file to add the `logging` exporter and setting the telemetry +configuration file to add the `logging` exporter and setting the telemetry log level to `debug` to enable more verbose logging from the collector lambda layer. ```yaml diff --git a/docs/use-cases/observability/clickstack/ingesting-data/sdks/browser.md b/docs/use-cases/observability/clickstack/ingesting-data/sdks/browser.md index dd73220edab..d61e2913145 100644 --- a/docs/use-cases/observability/clickstack/ingesting-data/sdks/browser.md +++ b/docs/use-cases/observability/clickstack/ingesting-data/sdks/browser.md @@ -11,7 +11,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; The ClickStack browser SDK allows you to instrument your frontend application to -send events to ClickStack. This allows you to view network +send events to ClickStack. This allows you to view network requests and exceptions alongside backend events in a single timeline. Additionally, it'll automatically capture and correlate session replay data, so @@ -31,74 +31,61 @@ This guide integrates the following: - **Install via package import (Recommended)** - Use the following command to install the [browser package](https://www.npmjs.com/package/@hyperdx/browser). - ```shell npm install @hyperdx/browser ``` - **Initialize ClickStack** - ```javascript import HyperDX from '@hyperdx/browser'; - HyperDX.init({ - url: 'http://localhost:4318', - apiKey: 'YOUR_INGESTION_API_KEY', - service: 'my-frontend-app', - tracePropagationTargets: [/api.myapp.domain/i], // Set to link traces from frontend to backend requests - consoleCapture: true, // Capture console logs (default false) - advancedNetworkCapture: true, // Capture full HTTP request/response headers and bodies (default false) +url: 'http://localhost:4318', +apiKey: 'YOUR_INGESTION_API_KEY', +service: 'my-frontend-app', +tracePropagationTargets: [/api.myapp.domain/i], // Set to link traces from frontend to backend requests +consoleCapture: true, // Capture console logs (default false) +advancedNetworkCapture: true, // Capture full HTTP request/response headers and bodies (default false) }); ``` - - **Install via Script Tag (Alternative)** - You can also include and install the script via a script tag as opposed to installing via NPM. This will expose the `HyperDX` global variable and can be used in the same way as the NPM package. - This is recommended if your site is not currently built using a bundler. - ```html ``` - - ### Options {#options} - `apiKey` - Your ClickStack Ingestion API Key. - `service` - The service name events will show up as in HyperDX UI. - `tracePropagationTargets` - A list of regex patterns to match against HTTP - requests to link frontend and backend traces, it will add an additional - `traceparent` header to all requests matching any of the patterns. 
This should - be set to your backend API domain (ex. `api.yoursite.com`). + requests to link frontend and backend traces, it will add an additional + `traceparent` header to all requests matching any of the patterns. This should + be set to your backend API domain (ex. `api.yoursite.com`). - `consoleCapture` - (Optional) Capture all console logs (default `false`). - `advancedNetworkCapture` - (Optional) Capture full request/response headers - and bodies (default false). + and bodies (default false). - `url` - (Optional) The OpenTelemetry collector URL, only needed for - self-hosted instances. + self-hosted instances. - `maskAllInputs` - (Optional) Whether to mask all input fields in session - replay (default `false`). + replay (default `false`). - `maskAllText` - (Optional) Whether to mask all text in session replay (default - `false`). + `false`). - `disableIntercom` - (Optional) Whether to disable Intercom integration (default `false`) - `disableReplay` - (Optional) Whether to disable session replay (default `false`) @@ -128,7 +115,7 @@ HyperDX.setGlobalAttributes({ ### Auto capture React error boundary errors {#auto-capture-react-error-boundary-errors} If you're using React, you can automatically capture errors that occur within -React error boundaries by passing your error boundary component +React error boundaries by passing your error boundary component into the `attachToReactErrorBoundary` function. ```javascript diff --git a/docs/use-cases/observability/clickstack/ingesting-data/sdks/deno.md b/docs/use-cases/observability/clickstack/ingesting-data/sdks/deno.md index e82b48a4f36..40cbb391756 100644 --- a/docs/use-cases/observability/clickstack/ingesting-data/sdks/deno.md +++ b/docs/use-cases/observability/clickstack/ingesting-data/sdks/deno.md @@ -11,9 +11,9 @@ This guide Integrates the following: - **Logs** -:::note -Currently only supports OpenTelemetry Logging. For tracing support, [see the following guide](https://dev.to/grunet/leveraging-opentelemetry-in-deno-45bj#a-minimal-interesting-example). -::: + :::note + Currently only supports OpenTelemetry Logging. For tracing support, [see the following guide](https://dev.to/grunet/leveraging-opentelemetry-in-deno-45bj#a-minimal-interesting-example). + ::: ## Logging {#logging} diff --git a/docs/use-cases/observability/clickstack/ingesting-data/sdks/golang.md b/docs/use-cases/observability/clickstack/ingesting-data/sdks/golang.md index 9711a330433..f2ca0833ef5 100644 --- a/docs/use-cases/observability/clickstack/ingesting-data/sdks/golang.md +++ b/docs/use-cases/observability/clickstack/ingesting-data/sdks/golang.md @@ -144,7 +144,6 @@ func ExampleHandler(w http.ResponseWriter, r *http.Request) { } ``` - ### Gin application example {#gin-application-example} For this example, we will be using `gin-gonic/gin`. 
@@ -229,7 +228,6 @@ func main() { } ``` - ### Configure environment variables {#configure-environment-variables} Afterwards you'll need to configure the following environment variables in your shell to ship telemetry to ClickStack: diff --git a/docs/use-cases/observability/clickstack/ingesting-data/sdks/index.md b/docs/use-cases/observability/clickstack/ingesting-data/sdks/index.md index 9dc65b41ae2..363b149a067 100644 --- a/docs/use-cases/observability/clickstack/ingesting-data/sdks/index.md +++ b/docs/use-cases/observability/clickstack/ingesting-data/sdks/index.md @@ -10,7 +10,7 @@ Users typically send data to ClickStack via the **OpenTelemetry (OTel) collector Language SDKs are responsible for collecting telemetry from within your application - most notably **traces** and **logs** - and exporting this data to the OpenTelemetry collector, via the OTLP endpoint, which handles ingestion into ClickHouse. -In browser-based environments, SDKs may also be responsible for collecting **session data**, including UI events, clicks, and navigation thus enabling replays of user sessions. +In browser-based environments, SDKs may also be responsible for collecting **session data**, including UI events, clicks, and navigation thus enabling replays of user sessions. ## How it works {#how-it-works} @@ -65,4 +65,4 @@ All SDKs support automatic correlation with Kubernetes metadata (pod name, names - Correlate application logs and traces with infrastructure metrics - Track resource usage and performance across your Kubernetes cluster -To enable this feature, configure the OpenTelemetry collector to forward resource tags to pods. See the [Kubernetes integration guide](/use-cases/observability/clickstack/ingesting-data/kubernetes#forwarding-resouce-tags-to-pods) for detailed setup instructions. + To enable this feature, configure the OpenTelemetry collector to forward resource tags to pods. See the [Kubernetes integration guide](/use-cases/observability/clickstack/ingesting-data/kubernetes#forwarding-resouce-tags-to-pods) for detailed setup instructions. diff --git a/docs/use-cases/observability/clickstack/ingesting-data/sdks/nextjs.md b/docs/use-cases/observability/clickstack/ingesting-data/sdks/nextjs.md index 38e83a6ec8d..62a50b5a82a 100644 --- a/docs/use-cases/observability/clickstack/ingesting-data/sdks/nextjs.md +++ b/docs/use-cases/observability/clickstack/ingesting-data/sdks/nextjs.md @@ -19,9 +19,9 @@ This Guide Integrates: - **Console Logs** - **Traces** -:::note -If you're looking for session replay/browser-side monitoring, you'll want to install the [Browser integration](/use-cases/observability/clickstack/sdks/browser) instead. -::: + :::note + If you're looking for session replay/browser-side monitoring, you'll want to install the [Browser integration](/use-cases/observability/clickstack/sdks/browser) instead. 
+ ::: ## Installing {#installing} @@ -36,7 +36,7 @@ const nextConfig = { experimental: { instrumentationHook: true, }, - // Ignore otel pkgs warnings + // Ignore otel pkgs warnings // https://github.com/open-telemetry/opentelemetry-js/issues/4173#issuecomment-1822938936 webpack: ( config, @@ -56,18 +56,14 @@ module.exports = nextConfig; - -```shell -npm install @hyperdx/node-opentelemetry +```shell +npm install @hyperdx/node-opentelemetry ``` - - -```shell -yarn add @hyperdx/node-opentelemetry +```shell +yarn add @hyperdx/node-opentelemetry ``` - @@ -90,7 +86,6 @@ export async function register() { This will allow Next.js to import the OpenTelemetry instrumentation for any serverless function invocation. - ### Configure environment variables {#configure-environment-variables} If you're sending traces directly to ClickStack, you'll need to start your Next.js diff --git a/docs/use-cases/observability/clickstack/ingesting-data/sdks/nodejs.md b/docs/use-cases/observability/clickstack/ingesting-data/sdks/nodejs.md index ad8c5612dd9..9d743a3ea82 100644 --- a/docs/use-cases/observability/clickstack/ingesting-data/sdks/nodejs.md +++ b/docs/use-cases/observability/clickstack/ingesting-data/sdks/nodejs.md @@ -29,18 +29,14 @@ Use the following command to install the [ClickStack OpenTelemetry package](http - -```shell -npm install @hyperdx/node-opentelemetry +```shell +npm install @hyperdx/node-opentelemetry ``` - - -```shell -yarn add @hyperdx/node-opentelemetry +```shell +yarn add @hyperdx/node-opentelemetry ``` - @@ -50,28 +46,22 @@ To initialize the SDK, you'll need to call the `init` function at the top of the - ```javascript const HyperDX = require('@hyperdx/node-opentelemetry'); - HyperDX.init({ - apiKey: 'YOUR_INGESTION_API_KEY', - service: 'my-service' +apiKey: 'YOUR_INGESTION_API_KEY', +service: 'my-service' }); ``` - - ```javascript import * as HyperDX from '@hyperdx/node-opentelemetry'; - HyperDX.init({ - apiKey: 'YOUR_INGESTION_API_KEY', - service: 'my-service' +apiKey: 'YOUR_INGESTION_API_KEY', +service: 'my-service' }); ``` - @@ -87,125 +77,96 @@ integrations if applicable (such as [Kubernetes](/use-cases/observability/clicks - If you're using `winston` as your logger, you'll need to add the following transport to your logger. - ```typescript - import winston from 'winston'; - import * as HyperDX from '@hyperdx/node-opentelemetry'; - - const logger = winston.createLogger({ - level: 'info', - format: winston.format.json(), - transports: [ - new winston.transports.Console(), - HyperDX.getWinstonTransport('info', { // Send logs info and above - detectResources: true, - }), - ], - }); - - export default logger; +import winston from 'winston'; +import * as HyperDX from '@hyperdx/node-opentelemetry'; +const logger = winston.createLogger({ +level: 'info', +format: winston.format.json(), +transports: [ +new winston.transports.Console(), +HyperDX.getWinstonTransport('info', { // Send logs info and above +detectResources: true, +}), +], +}); +export default logger; ``` - - If you're using `pino` as your logger, you'll need to add the following transport to your logger and specify a `mixin` to correlate logs with traces. 
- ```typescript import pino from 'pino'; import * as HyperDX from '@hyperdx/node-opentelemetry'; - const logger = pino( - pino.transport({ - mixin: HyperDX.getPinoMixinFunction, - targets: [ - HyperDX.getPinoTransport('info', { // Send logs info and above - detectResources: true, - }), - ], - }), +pino.transport({ +mixin: HyperDX.getPinoMixinFunction, +targets: [ +HyperDX.getPinoTransport('info', { // Send logs info and above +detectResources: true, +}), +], +}), ); - export default logger; ``` - - -By default, `console.*` methods are supported out of the box. No additional configuration is required. - +By default, `console.*` methods are supported out of the box. No additional configuration is required. You can disable this by setting the `HDX_NODE_CONSOLE_CAPTURE` environment variable to 0 or by passing `consoleCapture: false` to the `init` function. - ### Setup error collection {#setup-error-collection} -The ClickStack SDK can automatically capture uncaught exceptions and errors in your application with full stack trace and code context. +The ClickStack SDK can automatically capture uncaught exceptions and errors in your application with full stack trace and code context. To enable this, you'll need to add the following code to the end of your application's error handling middleware, or manually capture exceptions using the `recordException` function. - - -```javascript +```javascript const HyperDX = require('@hyperdx/node-opentelemetry'); HyperDX.init({ - apiKey: 'YOUR_INGESTION_API_KEY', - service: 'my-service' +apiKey: 'YOUR_INGESTION_API_KEY', +service: 'my-service' }); const app = express(); - // Add your routes, etc. - // Add this after all routes, // but before any and other error-handling middlewares are defined HyperDX.setupExpressErrorHandler(app); - app.listen(3000); ``` - - -```javascript +```javascript const Koa = require("koa"); const Router = require("@koa/router"); const HyperDX = require('@hyperdx/node-opentelemetry'); HyperDX.init({ - apiKey: 'YOUR_INGESTION_API_KEY', - service: 'my-service' +apiKey: 'YOUR_INGESTION_API_KEY', +service: 'my-service' }); - const router = new Router(); const app = new Koa(); - HyperDX.setupKoaErrorHandler(app); - // Add your routes, etc. - app.listen(3030); ``` - - ```javascript const HyperDX = require('@hyperdx/node-opentelemetry'); - function myErrorHandler(error, req, res, next) { - // This can be used anywhere in your application - HyperDX.recordException(error); +// This can be used anywhere in your application +HyperDX.recordException(error); } ``` - - ## Troubleshooting {#troubleshooting} If you're having trouble with the SDK, you can enable verbose logging by setting @@ -312,38 +273,28 @@ The following libraries will be automatically instrumented (traced) by the SDK: Alternatively, you can auto-instrument your application without any code changes by using the `opentelemetry-instrument` CLI or using the Node.js `--require` flag. The CLI installation exposes a wider range of auto-instrumented libraries and frameworks. 
- - ```shell HYPERDX_API_KEY='' OTEL_SERVICE_NAME='' npx opentelemetry-instrument index.js ``` - - ```shell HYPERDX_API_KEY='' OTEL_SERVICE_NAME='' ts-node -r '@hyperdx/node-opentelemetry/build/src/tracing' index.js ``` - - - -```javascript +```javascript // Import this at the very top of the first file loaded in your application // You'll still specify your API key via the `HYPERDX_API_KEY` environment variable import { initSDK } from '@hyperdx/node-opentelemetry'; - initSDK({ - consoleCapture: true, // optional, default: true - additionalInstrumentations: [], // optional, default: [] +consoleCapture: true, // optional, default: true +additionalInstrumentations: [], // optional, default: [] }); ``` - - _The `OTEL_SERVICE_NAME` environment variable is used to identify your service in the HyperDX app, it can be any name you want._ diff --git a/docs/use-cases/observability/clickstack/ingesting-data/sdks/python.md b/docs/use-cases/observability/clickstack/ingesting-data/sdks/python.md index 2d64351bf47..92d0e7075cd 100644 --- a/docs/use-cases/observability/clickstack/ingesting-data/sdks/python.md +++ b/docs/use-cases/observability/clickstack/ingesting-data/sdks/python.md @@ -10,7 +10,6 @@ title: 'Python' import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; - ClickStack uses the OpenTelemetry standard for collecting telemetry data (logs and traces). Traces are auto-generated with automatic instrumentation, so manual instrumentation isn't required to get value out of tracing. @@ -45,7 +44,7 @@ Afterwards you'll need to configure the following environment variables in your ```shell export HYPERDX_API_KEY='' \ OTEL_SERVICE_NAME='' \ -OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 ``` _The `OTEL_SERVICE_NAME` environment variable is used to identify your service in the HyperDX app, it can be any name you want._ @@ -60,41 +59,31 @@ opentelemetry-instrument python app.py #### If you are using `Gunicorn`, `uWSGI` or `uvicorn` {#using-uvicorn-gunicorn-uwsgi} -In this case, the OpenTelemetry Python agent will require additional changes to work. +In this case, the OpenTelemetry Python agent will require additional changes to work. To configure OpenTelemetry for application servers using the pre-fork web server mode, make sure to call the `configure_opentelemetry` method within the post-fork hook. - - ```python from hyperdx.opentelemetry import configure_opentelemetry - def post_fork(server, worker): - configure_opentelemetry() +configure_opentelemetry() ``` - ```python from hyperdx.opentelemetry import configure_opentelemetry from uwsgidecorators import postfork - @postfork def init_tracing(): - configure_opentelemetry() +configure_opentelemetry() ``` - - - -OpenTelemetry [currently does not work](https://github.com/open-telemetry/opentelemetry-python-contrib/issues/385) with `uvicorn` run using the `--reload` +OpenTelemetry [currently does not work](https://github.com/open-telemetry/opentelemetry-python-contrib/issues/385) with `uvicorn` run using the `--reload` flag or with multi-workers (`--workers`). We recommend disabling those flags while testing, or using Gunicorn. 
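One possible workaround - a sketch rather than an official recommendation - is to serve the ASGI app through Gunicorn using uvicorn's worker class and initialize instrumentation in the post-fork hook, as described above. The module path `main:app` is assumed:

```python
# gunicorn.conf.py - minimal sketch, assuming an ASGI app exposed as `main:app`.
# Run with: gunicorn -k uvicorn.workers.UvicornWorker -c gunicorn.conf.py main:app
from hyperdx.opentelemetry import configure_opentelemetry

# Gunicorn calls this hook in every worker after forking, so each worker process
# initializes its own OpenTelemetry pipeline.
def post_fork(server, worker):
    configure_opentelemetry()
```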
- - ## Advanced configuration {#advanced-configuration} @@ -139,5 +128,3 @@ export DEBUG=true Read more about Python OpenTelemetry instrumentation here: [https://opentelemetry.io/docs/instrumentation/python/manual/](https://opentelemetry.io/docs/instrumentation/python/manual/) - - diff --git a/docs/use-cases/observability/clickstack/ingesting-data/sdks/ruby.md b/docs/use-cases/observability/clickstack/ingesting-data/sdks/ruby.md index da880c9dd2b..1bd4fa7cc37 100644 --- a/docs/use-cases/observability/clickstack/ingesting-data/sdks/ruby.md +++ b/docs/use-cases/observability/clickstack/ingesting-data/sdks/ruby.md @@ -19,7 +19,6 @@ This guide integrates:
    - _To send logs to ClickStack, please send logs via the [OpenTelemetry collector](/use-cases/observability/clickstack/ingesting-data/otel-collector)._ ## Getting started {#getting-started} diff --git a/docs/use-cases/observability/clickstack/migration/elastic/concepts.md b/docs/use-cases/observability/clickstack/migration/elastic/concepts.md index 4f97360fb90..7fa715e3372 100644 --- a/docs/use-cases/observability/clickstack/migration/elastic/concepts.md +++ b/docs/use-cases/observability/clickstack/migration/elastic/concepts.md @@ -26,20 +26,20 @@ Both Elastic Stack and ClickStack cover the core roles of an observability platf - **Storage and Query Engine**: the backend systems responsible for storing observability data and serving analytical queries. - **Data Collection and ETL**: agents and pipelines that gather telemetry data and process it before ingestion. -The table below outlines how each stack maps its components to these roles: + The table below outlines how each stack maps its components to these roles: -| **Role** | **Elastic Stack** | **ClickStack** | **Comments** | -|--------------------------|--------------------------------------------------|--------------------------------------------------|--------------| -| **UI & Alerting** | **Kibana** — dashboards, search, and alerts | **HyperDX** — real-time UI, search, and alerts | Both serve as the primary interface for users, including visualizations and alert management. HyperDX is purpose-built for observability and tightly coupled to OpenTelemetry semantics. | -| **Storage & Query Engine** | **Elasticsearch** — JSON document store with inverted index | **ClickHouse** — column-oriented database with vectorized engine | Elasticsearch uses an inverted index optimized for search; ClickHouse uses columnar storage and SQL for high-speed analytics over structured and semi-structured data. | -| **Data Collection** | **Elastic Agent**, **Beats** (e.g. Filebeat, Metricbeat) | **OpenTelemetry Collector** (edge + gateway) | Elastic supports custom shippers and a unified agent managed by Fleet. ClickStack relies on OpenTelemetry, allowing vendor-neutral data collection and processing. | -| **Instrumentation SDKs** | **Elastic APM agents** (proprietary) | **OpenTelemetry SDKs** (distributed by ClickStack) | Elastic SDKs are tied to the Elastic stack. ClickStack builds on OpenTelemetry SDKs for logs, metrics, and traces in major languages. | -| **ETL / Data Processing** | **Logstash**, ingest pipelines | **OpenTelemetry Collector** + ClickHouse materialized views | Elastic uses ingest pipelines and Logstash for transformation. ClickStack shifts compute to insert time via materialized views and OTel collector processors, which transform data efficiently and incrementally. | -| **Architecture Philosophy** | Vertically integrated, proprietary agents and formats | Open standard–based, loosely coupled components | Elastic builds a tightly integrated ecosystem. ClickStack emphasizes modularity and standards (OpenTelemetry, SQL, object storage) for flexibility and cost-efficiency. | + | **Role** | **Elastic Stack** | **ClickStack** | **Comments** | + |--------------------------|--------------------------------------------------|--------------------------------------------------|--------------| + | **UI & Alerting** | **Kibana** — dashboards, search, and alerts | **HyperDX** — real-time UI, search, and alerts | Both serve as the primary interface for users, including visualizations and alert management. 
HyperDX is purpose-built for observability and tightly coupled to OpenTelemetry semantics. | + | **Storage & Query Engine** | **Elasticsearch** — JSON document store with inverted index | **ClickHouse** — column-oriented database with vectorized engine | Elasticsearch uses an inverted index optimized for search; ClickHouse uses columnar storage and SQL for high-speed analytics over structured and semi-structured data. | + | **Data Collection** | **Elastic Agent**, **Beats** (e.g. Filebeat, Metricbeat) | **OpenTelemetry Collector** (edge + gateway) | Elastic supports custom shippers and a unified agent managed by Fleet. ClickStack relies on OpenTelemetry, allowing vendor-neutral data collection and processing. | + | **Instrumentation SDKs** | **Elastic APM agents** (proprietary) | **OpenTelemetry SDKs** (distributed by ClickStack) | Elastic SDKs are tied to the Elastic stack. ClickStack builds on OpenTelemetry SDKs for logs, metrics, and traces in major languages. | + | **ETL / Data Processing** | **Logstash**, ingest pipelines | **OpenTelemetry Collector** + ClickHouse materialized views | Elastic uses ingest pipelines and Logstash for transformation. ClickStack shifts compute to insert time via materialized views and OTel collector processors, which transform data efficiently and incrementally. | + | **Architecture Philosophy** | Vertically integrated, proprietary agents and formats | Open standard–based, loosely coupled components | Elastic builds a tightly integrated ecosystem. ClickStack emphasizes modularity and standards (OpenTelemetry, SQL, object storage) for flexibility and cost-efficiency. | -ClickStack emphasizes open standards and interoperability, being fully OpenTelemetry-native from collection to UI. In contrast, Elastic provides a tightly coupled but more vertically integrated ecosystem with proprietary agents and formats. + ClickStack emphasizes open standards and interoperability, being fully OpenTelemetry-native from collection to UI. In contrast, Elastic provides a tightly coupled but more vertically integrated ecosystem with proprietary agents and formats. -Given that **Elasticsearch** and **ClickHouse** are the core engines responsible for data storage, processing, and querying in their respective stacks, understanding how they differ is essential. These systems underpin the performance, scalability, and flexibility of the entire observability architecture. The following section explores the key differences between Elasticsearch and ClickHouse - including how they model data, handle ingestion, execute queries, and manage storage. + Given that **Elasticsearch** and **ClickHouse** are the core engines responsible for data storage, processing, and querying in their respective stacks, understanding how they differ is essential. These systems underpin the performance, scalability, and flexibility of the entire observability architecture. The following section explores the key differences between Elasticsearch and ClickHouse - including how they model data, handle ingestion, execute queries, and manage storage. ## Elasticsearch vs ClickHouse {#elasticsearch-vs-clickhouse} @@ -69,7 +69,6 @@ Elasticsearch uses ingest pipelines with processors (e.g., `enrich`, `rename`, ` For enrichment, Elasticsearch supports dedicated [enrich processors](https://www.elastic.co/docs/reference/enrich-processor/enrich-processor) to add context to documents. 
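As a rough illustration of query-time enrichment (all names here are hypothetical: a `service_owners` dictionary with a String key and a `team` attribute, plus an OTel-style `otel_logs` table), a dictionary lookup can be applied directly in a `SELECT`:

```sql
-- Hypothetical dictionary, table, and column names, for illustration only.
SELECT
    Timestamp,
    ServiceName,
    dictGet('service_owners', 'team', tuple(ServiceName)) AS owning_team
FROM otel_logs
WHERE SeverityText = 'ERROR'
ORDER BY Timestamp DESC
LIMIT 100;
```

The same expression can be placed in the `SELECT` of a materialized view to apply the enrichment at insert time instead.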
In ClickHouse, [**dictionaries**](/dictionary) can be used at both [query time](/dictionary#query-time-enrichment) and [ingest time](/dictionary#index-time-enrichment) to enrich rows - for example, to [map IPs to locations](/use-cases/observability/schema-design#using-ip-dictionaries) or apply [user agent lookups](/use-cases/observability/schema-design#using-regex-dictionaries-user-agent-parsing) on insert. - ### Query languages {#query-languages} Elasticsearch supports a [number of query languages](https://www.elastic.co/docs/explore-analyze/query-filter/languages) including [DSL](https://www.elastic.co/docs/explore-analyze/query-filter/languages/querydsl), [ES|QL](https://www.elastic.co/docs/explore-analyze/query-filter/languages/esql), [EQL](https://www.elastic.co/docs/explore-analyze/query-filter/languages/eql) and [KQL](https://www.elastic.co/docs/explore-analyze/query-filter/languages/kql) (Lucene style) queries, but has limited support for joins — only **left outer joins** are available via [`ES|QL`](https://www.elastic.co/guide/en/elasticsearch/reference/8.x/esql-commands.html#esql-lookup-join). ClickHouse supports **full SQL syntax**, including [all join types](/sql-reference/statements/select/join#supported-types-of-join), [window functions](/sql-reference/window-functions), subqueries (and correlated), and CTEs. This is a major advantage for users needing to correlate between observability signals and business or infrastructure data. @@ -94,13 +93,13 @@ The concept of sharding is fundamental to Elasticsearch's scalability model. Eac Elasticsearch recommends sizing shards to around [50 GB or 200 million documents](https://www.elastic.co/docs/deploy-manage/production-guidance/optimize-performance/size-shards) due to [JVM heap and metadata overhead](https://www.elastic.co/docs/deploy-manage/production-guidance/optimize-performance/size-shards#each-shard-has-overhead). There's also a hard limit of [2 billion documents per shard](https://www.elastic.co/docs/deploy-manage/production-guidance/optimize-performance/size-shards#troubleshooting-max-docs-limit). Elasticsearch parallelizes queries across shards, but each shard is processed using a **single thread**, making over-sharding both costly and counterproductive. This inherently tightly couples sharding to scaling, with more shards (and nodes) required to scale performance. -Elasticsearch indexes all fields into [**inverted indices**](https://www.elastic.co/docs/manage-data/data-store/index-basics) for fast search, optionally using [**doc values**](https://www.elastic.co/docs/reference/elasticsearch/mapping-reference/doc-values) for aggregations, sorting and scripted field access. Numeric and geo fields use [Block K-D trees](https://users.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf) for searches on geospatial data and numeric and date ranges. +Elasticsearch indexes all fields into [**inverted indices**](https://www.elastic.co/docs/manage-data/data-store/index-basics) for fast search, optionally using [**doc values**](https://www.elastic.co/docs/reference/elasticsearch/mapping-reference/doc-values) for aggregations, sorting and scripted field access. Numeric and geo fields use [Block K-D trees](https://users.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf) for searches on geospatial data and numeric and date ranges. 
Importantly, Elasticsearch stores the full original document in [`_source`](https://www.elastic.co/docs/reference/elasticsearch/mapping-reference/mapping-source-field) (compressed with `LZ4`, `Deflate` or `ZSTD`), while ClickHouse does not store a separate document representation. Data is reconstructed from columns at query time, saving storage space. This same capability is possible for Elasticsearch using [Synthetic `_source`](https://www.elastic.co/docs/reference/elasticsearch/mapping-reference/mapping-source-field#synthetic-source), with some [restrictions](https://www.elastic.co/docs/reference/elasticsearch/mapping-reference/mapping-source-field#synthetic-source-restrictions). Disabling of `_source` also has [implications](https://www.elastic.co/docs/reference/elasticsearch/mapping-reference/mapping-source-field#include-exclude) which don't apply to ClickHouse. In Elasticsearch, [index mappings](https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping.html) (equivalent to table schemas in ClickHouse) control the type of fields and the data structures used for this persistence and querying. -ClickHouse, by contrast, is **column-oriented** — every column is stored independently but always sorted by the table's primary/ordering key. This ordering enables [sparse primary indexes](/primary-indexes), which allow ClickHouse to skip over data during query execution efficiently. When queries filter by primary key fields, ClickHouse reads only the relevant parts of each column, significantly reducing disk I/O and improving performance — even without a full index on every column. +ClickHouse, by contrast, is **column-oriented** — every column is stored independently but always sorted by the table's primary/ordering key. This ordering enables [sparse primary indexes](/primary-indexes), which allow ClickHouse to skip over data during query execution efficiently. When queries filter by primary key fields, ClickHouse reads only the relevant parts of each column, significantly reducing disk I/O and improving performance — even without a full index on every column. ClickHouse @@ -108,7 +107,7 @@ ClickHouse also supports [**skip indexes**](/optimize/skipping-indexes), which a ClickHouse also supports sharding, but its model is designed to favor **vertical scaling**. A single shard can store **trillions of rows** and continues to perform efficiently as long as memory, CPU, and disk permit. Unlike Elasticsearch, there is **no hard row limit** per shard. Shards in ClickHouse are logical — effectively individual tables — and do not require partitioning unless the dataset exceeds the capacity of a single node. This typically occurs due to disk size constraints, with sharding ① introduced only when horizontal scale-out is necessary - reducing complexity and overhead. In this case, similar to Elasticsearch, a shard will hold a subset of the data. The data within a single shard is organized as a collection of ② immutable data parts containing ③ several data structures. -Processing within a ClickHouse shard is **fully parallelized**, and users are encouraged to scale vertically to avoid the network costs associated with moving data across nodes. +Processing within a ClickHouse shard is **fully parallelized**, and users are encouraged to scale vertically to avoid the network costs associated with moving data across nodes. 
:::note Insert processing in ClickHouse Inserts in ClickHouse are **synchronous by default** — the write is acknowledged only after commit — but can be configured for **asynchronous inserts** to match Elastic-like buffering and batching. If [asynchronous data inserts](https://clickhouse.com/blog/asynchronous-data-inserts-in-clickhouse) are used, Ⓐ newly inserted rows first go into an Ⓑ in-memory insert buffer that is flushed by default once every 200 milliseconds. If multiple shards are used, a [distributed table](/engines/table-engines/special/distributed) is used for routing newly inserted rows to their target shard. A new part is written for the shard on disk. @@ -131,7 +130,7 @@ In summary: - **Elastic**: Shards are physical Lucene structures tied to JVM memory. Over-sharding introduces performance penalties. Replication is synchronous and coordinated by a master node. - **ClickHouse**: Shards are logical and vertically scalable, with highly efficient local execution. Replication is asynchronous (but can be sequential), and coordination is lightweight. -Ultimately, ClickHouse favors simplicity and performance at scale by minimizing the need for shard tuning while still offering strong consistency guarantees when needed. + Ultimately, ClickHouse favors simplicity and performance at scale by minimizing the need for shard tuning while still offering strong consistency guarantees when needed. ### Deduplication and routing {#deduplication-and-routing} @@ -161,7 +160,7 @@ ClickHouse imposes no size limits. You can perform unbounded group-by queries ac The above differences can be attributed to the execution models of Elasticsearch and ClickHouse, which take fundamentally different approaches to query execution and parallelism. -ClickHouse was designed to maximize efficiency on modern hardware. By default, ClickHouse runs a SQL query with N concurrent execution lanes on a machine with N CPU cores: +ClickHouse was designed to maximize efficiency on modern hardware. By default, ClickHouse runs a SQL query with N concurrent execution lanes on a machine with N CPU cores: ClickHouse execution @@ -170,16 +169,15 @@ On a single node, execution lanes split data into independent ranges allowing co Query execution is further parallelized by: 1. **SIMD vectorization**: operations on columnar data use [CPU SIMD instructions](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) (e.g., [AVX512](https://en.wikipedia.org/wiki/AVX-512)), allowing batch processing of values. 2. **Cluster-level parallelism**: in distributed setups, each node performs query processing locally. [Partial aggregation states](https://clickhouse.com/blog/aggregate-functions-combinators-in-clickhouse-for-arrays-maps-and-states#working-with-aggregation-states) are streamed to the initiating node and merged. If the query's `GROUP BY` keys align with the [sharding keys](/architecture/horizontal-scaling#shard), merging can be [minimized or avoided entirely](/operations/settings/settings#distributed_group_by_no_merge). -
    -This model enables efficient scaling across cores and nodes, making ClickHouse well-suited for large-scale analytics. The use of *partial aggregation states* allows intermediate results from different threads and nodes to be merged without loss of accuracy. + This model enables efficient scaling across cores and nodes, making ClickHouse well-suited for large-scale analytics. The use of *partial aggregation states* allows intermediate results from different threads and nodes to be merged without loss of accuracy. -Elasticsearch, by contrast, assigns one thread per shard for most aggregations, regardless of how many CPU cores are available. These threads return shard-local top-N results, which are merged at the coordinating node. This approach can underutilize system resources and introduce potential inaccuracies in global aggregations, particularly when frequent terms are distributed across multiple shards. Accuracy can be improved by increasing the `shard_size` parameter, but this comes at the cost of higher memory usage and query latency. + Elasticsearch, by contrast, assigns one thread per shard for most aggregations, regardless of how many CPU cores are available. These threads return shard-local top-N results, which are merged at the coordinating node. This approach can underutilize system resources and introduce potential inaccuracies in global aggregations, particularly when frequent terms are distributed across multiple shards. Accuracy can be improved by increasing the `shard_size` parameter, but this comes at the cost of higher memory usage and query latency. -Elasticsearch execution + Elasticsearch execution -In summary, ClickHouse executes aggregations and queries with finer-grained parallelism and greater control over hardware resources, while Elasticsearch relies on shard-based execution with more rigid constraints. + In summary, ClickHouse executes aggregations and queries with finer-grained parallelism and greater control over hardware resources, while Elasticsearch relies on shard-based execution with more rigid constraints. -For further details on the mechanics of aggregations in the respective technologies, we recommend the blog post ["ClickHouse vs. Elasticsearch: The Mechanics of Count Aggregations"](https://clickhouse.com/blog/clickhouse_vs_elasticsearch_mechanics_of_count_aggregations#elasticsearch). + For further details on the mechanics of aggregations in the respective technologies, we recommend the blog post ["ClickHouse vs. Elasticsearch: The Mechanics of Count Aggregations"](https://clickhouse.com/blog/clickhouse_vs_elasticsearch_mechanics_of_count_aggregations#elasticsearch). ### Data management {#data-management} @@ -205,7 +203,7 @@ In **ClickHouse Cloud**, this becomes even more seamless: all data is stored on In Elasticsearch, **rollups** or **aggregates** are achieved using a mechanism called [**transforms**](https://www.elastic.co/guide/en/elasticsearch/reference/current/transforms.html). These are used to summarize time-series data at fixed intervals (e.g., hourly or daily) using a **sliding window** model. These are configured as recurring background jobs that aggregate data from one index and write the results to a separate **rollup index**. This helps reduce the cost of long-range queries by avoiding repeated scans of high-cardinality raw data. 
-The following diagram sketches abstractly how transforms work (note that we use the blue color for all documents belonging to the same bucket for which we want to pre-calculate aggregate values): +The following diagram sketches abstractly how transforms work (note that we use the blue color for all documents belonging to the same bucket for which we want to pre-calculate aggregate values): Elasticsearch transforms @@ -215,15 +213,15 @@ ClickHouse takes a fundamentally different approach. Rather than re-aggregating This model is made possible by ClickHouse's support for [**partial aggregate states**](https://clickhouse.com/docs/en/sql-reference/data-types/aggregatefunction) — intermediate representations of aggregation functions that can be stored and later merged. This allows users to maintain partially aggregated results that are fast to query and cheap to update. Since the aggregation happens as data arrives, there's no need to run expensive recurring jobs or re-summarize older data. -We sketch the mechanics of incremental materialized views abstractly (note that we use the blue color for all rows belonging to the same group for which we want to pre-calculate aggregate values): +We sketch the mechanics of incremental materialized views abstractly (note that we use the blue color for all rows belonging to the same group for which we want to pre-calculate aggregate values): ClickHouse Materialized Views -In the diagram above, the materialized view's source table already contains a data part storing some `blue` rows (1 to 10) belonging to the same group. For this group, there also already exists a data part in the view's target table storing a [partial aggregation state](https://www.youtube.com/watch?v=QDAJTKZT8y4) for the `blue` group. When ① ② ③ inserts into the source table with new rows take place, a corresponding source table data part is created for each insert, and, in parallel, (just) for each block of newly inserted rows, a partial aggregation state is calculated and inserted in the form of a data part into the materialized view's target table. ④ During background part merges, the partial aggregation states are merged, resulting in incremental data aggregation. +In the diagram above, the materialized view's source table already contains a data part storing some `blue` rows (1 to 10) belonging to the same group. For this group, there also already exists a data part in the view's target table storing a [partial aggregation state](https://www.youtube.com/watch?v=QDAJTKZT8y4) for the `blue` group. When ① ② ③ inserts into the source table with new rows take place, a corresponding source table data part is created for each insert, and, in parallel, (just) for each block of newly inserted rows, a partial aggregation state is calculated and inserted in the form of a data part into the materialized view's target table. ④ During background part merges, the partial aggregation states are merged, resulting in incremental data aggregation. -Note that all [aggregate functions](https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference) (over 90 of them), including their combinations with aggregate function [combinators](https://www.youtube.com/watch?v=7ApwD0cfAFI), support [partial aggregation states](https://clickhouse.com/docs/en/sql-reference/data-types/aggregatefunction). 
+Note that all [aggregate functions](https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference) (over 90 of them), including their combinations with aggregate function [combinators](https://www.youtube.com/watch?v=7ApwD0cfAFI), support [partial aggregation states](https://clickhouse.com/docs/en/sql-reference/data-types/aggregatefunction). -For a more concrete example of Elasticsearch vs ClickHouse for incremental aggregates, see this [example](https://github.com/ClickHouse/examples/tree/main/blog-examples/clickhouse-vs-elasticsearch/continuous-data-transformation#continuous-data-transformation-example). +For a more concrete example of Elasticsearch vs ClickHouse for incremental aggregates, see this [example](https://github.com/ClickHouse/examples/tree/main/blog-examples/clickhouse-vs-elasticsearch/continuous-data-transformation#continuous-data-transformation-example). The advantages of ClickHouse's approach include: @@ -233,7 +231,7 @@ The advantages of ClickHouse's approach include: - **Composable**: materialized views can be layered or joined with other views and tables for more complex query acceleration strategies. - **Different TTLs**: different TTL settings can be applied to the source table and target table of the materialized view. -This model is particularly powerful for observability use cases where users need to compute metrics such as per-minute error rates, latencies, or top-N breakdowns without scanning billions of raw records per query. + This model is particularly powerful for observability use cases where users need to compute metrics such as per-minute error rates, latencies, or top-N breakdowns without scanning billions of raw records per query. ### Lakehouse support {#lakehouse-support} @@ -249,4 +247,4 @@ ClickHouse's lakehouse capabilities extend beyond just reading data: - **Incremental loading**: support for continuous loading from lakehouse tables into local [MergeTree](/engines/table-engines/mergetree-family/mergetree) tables, using features like [S3Queue](/engines/table-engines/integrations/s3queue) and [ClickPipes](/integrations/clickpipes). - **Performance optimization**: distributed query execution over lakehouse data using [cluster functions](/sql-reference/table-functions/cluster) for improved performance. -These capabilities make ClickHouse a natural fit for organizations adopting lakehouse architectures, allowing them to leverage both the flexibility of data lakes and the performance of a columnar database. + These capabilities make ClickHouse a natural fit for organizations adopting lakehouse architectures, allowing them to leverage both the flexibility of data lakes and the performance of a columnar database. diff --git a/docs/use-cases/observability/clickstack/migration/elastic/index.md b/docs/use-cases/observability/clickstack/migration/elastic/index.md index 91b64208a29..d1d6ae8242c 100644 --- a/docs/use-cases/observability/clickstack/migration/elastic/index.md +++ b/docs/use-cases/observability/clickstack/migration/elastic/index.md @@ -8,7 +8,7 @@ show_related_blogs: true keywords: ['Elasticsearch'] --- -This guide provides a comprehensive approach to migrating from Elastic Stack to ClickStack. We focus on a parallel operation strategy that minimizes risk while leveraging ClickHouse's strengths in observability workloads. +This guide provides a comprehensive approach to migrating from Elastic Stack to ClickStack. 
We focus on a parallel operation strategy that minimizes risk while leveraging ClickHouse's strengths in observability workloads. | Section | Description | |---------|-------------| @@ -19,4 +19,3 @@ This guide provides a comprehensive approach to migrating from Elastic Stack to | [Migrating Data](/use-cases/observability/clickstack/migration/elastic/migrating-data) | Strategies for data migration and parallel operation | | [Migrating Agents](/use-cases/observability/clickstack/migration/elastic/migrating-agents) | Transitioning from Elastic agents to OpenTelemetry | | [Migrating SDKs](/use-cases/observability/clickstack/migration/elastic/migrating-sdks) | Replacing Elastic APM agents with OpenTelemetry SDKs | - diff --git a/docs/use-cases/observability/clickstack/migration/elastic/intro.md b/docs/use-cases/observability/clickstack/migration/elastic/intro.md index 66bed39785f..223537a4b3c 100644 --- a/docs/use-cases/observability/clickstack/migration/elastic/intro.md +++ b/docs/use-cases/observability/clickstack/migration/elastic/intro.md @@ -24,7 +24,7 @@ You should consider moving to ClickStack if: - You are committed to OpenTelemetry and want to avoid vendor lock-in. - You want to take advantage of the separation of storage and compute in ClickHouse Cloud, enabling virtually unlimited scale — paying only for ingestion compute and object storage during idle periods. -However, ClickStack may not be suitable if: + However, ClickStack may not be suitable if: - You use observability data primarily for security use cases and need a SIEM-focused product. - Universal profiling is a critical part of your workflow. diff --git a/docs/use-cases/observability/clickstack/migration/elastic/migrating-agents.md b/docs/use-cases/observability/clickstack/migration/elastic/migrating-agents.md index 2afbc1b6d56..c1074427c15 100644 --- a/docs/use-cases/observability/clickstack/migration/elastic/migrating-agents.md +++ b/docs/use-cases/observability/clickstack/migration/elastic/migrating-agents.md @@ -24,7 +24,7 @@ The Elastic Stack provides a number of Observability data collection agents. Spe - The [`Elastic Agent`](https://www.elastic.co/elastic-agent) provides a unified agent capable of collecting logs, metrics, and traces. This agent can be centrally managed via the [Elastic Fleet Server](https://www.elastic.co/docs/reference/fleet/manage-elastic-agents-in-fleet) and supports output to Elasticsearch, Logstash, Kafka, or Redis. - Elastic also provides a distribution of the [OpenTelemetry Collector - EDOT](https://www.elastic.co/docs/reference/opentelemetry). While it currently cannot be orchestrated by the Fleet Server, it offers a more flexible and open path for users migrating to ClickStack. -The best migration path depends on the agent(s) currently in use. In the sections that follow, we document migration options for each major agent type. Our goal is to minimize friction and, where possible, allow users to continue using their existing agents during the transition. + The best migration path depends on the agent(s) currently in use. In the sections that follow, we document migration options for each major agent type. Our goal is to minimize friction and, where possible, allow users to continue using their existing agents during the transition. 
## Preferred migration path {#prefered-migration-path} @@ -50,7 +50,7 @@ Users with extensive Beat deployments may wish to retain these when migrating to Beats agents use the [Elastic Common Schema (ECS)](https://www.elastic.co/docs/reference/ecs), which is currently [in the process of being merged into the OpenTelemetry](https://github.com/open-telemetry/opentelemetry-specification/blob/main/oteps/0199-support-elastic-common-schema-in-opentelemetry.md) specification used by ClickStack. However, these [schemas still differ significantly](https://www.elastic.co/docs/reference/ecs/ecs-otel-alignment-overview), and users are currently responsible for transforming ECS-formatted events into OpenTelemetry format before ingestion into ClickStack. -We recommend performing this transformation using [Vector](https://vector.dev), a lightweight and high-performance observability data pipeline that supports a powerful transformation language called Vector Remap Language (VRL). +We recommend performing this transformation using [Vector](https://vector.dev), a lightweight and high-performance observability data pipeline that supports a powerful transformation language called Vector Remap Language (VRL). If your Filebeat agents are configured to send data to Kafka - a supported output by Beats - Vector can consume those events from Kafka, apply schema transformations using VRL, and then forward them via OTLP to the OpenTelemetry Collector distributed with ClickStack. @@ -63,257 +63,215 @@ We illustrate both of these architectures below. In the following example, we provide the initial steps to configure Vector to receive log events from Filebeat via the Lumberjack protocol. We provide VRL for mapping the inbound ECS events to OTel specification, before sending these to the ClickStack OpenTelemetry collector via OTLP. Users consuming events from Kafka can replace the Vector Logstash source with the [Kafka source](https://vector.dev/docs/reference/configuration/sources/kafka/) - all other steps remain the same. - ### Install vector {#install-vector} - Install Vector using the [official installation guide](https://vector.dev/docs/setup/installation/). - This can be installed on the same instance as your Elastic Stack OTel collector. - Users can follow best practices with regards to architecture and security when [moving Vector to production](https://vector.dev/docs/setup/going-to-prod/). - ### Configure vector {#configure-vector} - Vector should be configured to receive events over the Lumberjack protocol, imitating a Logstash instance. 
This can be achieved by configuring a [`logstash` source](https://vector.dev/docs/reference/configuration/sources/logstash/) for Vector: - ```yaml sources: - beats: - type: logstash - address: 0.0.0.0:5044 - tls: - enabled: false # Set to true if you're using TLS - # The files below are generated from the steps at https://www.elastic.co/docs/reference/fleet/secure-logstash-connections#generate-logstash-certs - # crt_file: logstash.crt - # key_file: logstash.key - # ca_file: ca.crt - # verify_certificate: true +beats: +type: logstash +address: 0.0.0.0:5044 +tls: +enabled: false # Set to true if you're using TLS +# The files below are generated from the steps at https://www.elastic.co/docs/reference/fleet/secure-logstash-connections#generate-logstash-certs +# crt_file: logstash.crt +# key_file: logstash.key +# ca_file: ca.crt +# verify_certificate: true ``` - :::note TLS configuration If Mutual TLS is required, generate certificates and keys using the Elastic guide ["Configure SSL/TLS for the Logstash output"](https://www.elastic.co/docs/reference/fleet/secure-logstash-connections#use-ls-output). These can then be specified in the configuration as shown above. ::: - - Events will be received in ECS format. These can be converted to the OpenTelemetry schema using a Vector Remap Language (VRL) transformer. Configuration of this transformer is simple - with the script file held in a separate file: - ```yaml transforms: - remap_filebeat: - inputs: ["beats"] - type: "remap" - file: 'beat_to_otel.vrl' +remap_filebeat: +inputs: ["beats"] +type: "remap" +file: 'beat_to_otel.vrl' ``` - Note it receives events from the above `beats` source. Our remap script is shown below. This script has been tested with log events only but can form the basis for other formats. -
    VRL - ECS to OTel - ```javascript # Define keys to ignore at root level ignored_keys = ["@metadata"] - # Define resource key prefixes resource_keys = ["host", "cloud", "agent", "service"] - # Create separate objects for resource and log record fields resource_obj = {} log_record_obj = {} - # Copy all non-ignored root keys to appropriate objects root_keys = keys(.) for_each(root_keys) -> |_index, key| { - if !includes(ignored_keys, key) { - val, err = get(., [key]) - if err == null { - # Check if this is a resource field - is_resource = false - if includes(resource_keys, key) { - is_resource = true - } - - # Add to appropriate object - if is_resource { - resource_obj = set(resource_obj, [key], val) ?? resource_obj - } else { - log_record_obj = set(log_record_obj, [key], val) ?? log_record_obj - } - } - } +if !includes(ignored_keys, key) { +val, err = get(., [key]) +if err == null { +# Check if this is a resource field +is_resource = false +if includes(resource_keys, key) { +is_resource = true +} +# Add to appropriate object +if is_resource { +resource_obj = set(resource_obj, [key], val) ?? resource_obj +} else { +log_record_obj = set(log_record_obj, [key], val) ?? log_record_obj +} +} +} } - # Flatten both objects separately flattened_resources = flatten(resource_obj, separator: ".") flattened_logs = flatten(log_record_obj, separator: ".") - # Process resource attributes resource_attributes = [] resource_keys_list = keys(flattened_resources) for_each(resource_keys_list) -> |_index, field_key| { - field_value, err = get(flattened_resources, [field_key]) - if err == null && field_value != null { - attribute, err = { - "key": field_key, - "value": { - "stringValue": to_string(field_value) - } - } - if (err == null) { - resource_attributes = push(resource_attributes, attribute) - } - } +field_value, err = get(flattened_resources, [field_key]) +if err == null && field_value != null { +attribute, err = { +"key": field_key, +"value": { +"stringValue": to_string(field_value) +} +} +if (err == null) { +resource_attributes = push(resource_attributes, attribute) +} +} } - # Process log record attributes log_attributes = [] log_keys_list = keys(flattened_logs) for_each(log_keys_list) -> |_index, field_key| { - field_value, err = get(flattened_logs, [field_key]) - if err == null && field_value != null { - attribute, err = { - "key": field_key, - "value": { - "stringValue": to_string(field_value) - } - } - if (err == null) { - log_attributes = push(log_attributes, attribute) - } - } +field_value, err = get(flattened_logs, [field_key]) +if err == null && field_value != null { +attribute, err = { +"key": field_key, +"value": { +"stringValue": to_string(field_value) +} +} +if (err == null) { +log_attributes = push(log_attributes, attribute) +} +} } - # Get timestamp for timeUnixNano (convert to nanoseconds) timestamp_nano = if exists(.@timestamp) { - to_unix_timestamp!(parse_timestamp!(.@timestamp, format: "%Y-%m-%dT%H:%M:%S%.3fZ"), unit: "nanoseconds") +to_unix_timestamp!(parse_timestamp!(.@timestamp, format: "%Y-%m-%dT%H:%M:%S%.3fZ"), unit: "nanoseconds") } else { - to_unix_timestamp(now(), unit: "nanoseconds") +to_unix_timestamp(now(), unit: "nanoseconds") } - # Get message/body field body_value = if exists(.message) { - to_string!(.message) +to_string!(.message) } else if exists(.body) { - to_string!(.body) +to_string!(.body) } else { - "" +"" } - # Create the OpenTelemetry structure . 
= { - "resourceLogs": [ - { - "resource": { - "attributes": resource_attributes - }, - "scopeLogs": [ - { - "scope": {}, - "logRecords": [ - { - "timeUnixNano": to_string(timestamp_nano), - "severityNumber": 9, - "severityText": "info", - "body": { - "stringValue": body_value - }, - "attributes": log_attributes - } - ] - } - ] - } - ] +"resourceLogs": [ +{ +"resource": { +"attributes": resource_attributes +}, +"scopeLogs": [ +{ +"scope": {}, +"logRecords": [ +{ +"timeUnixNano": to_string(timestamp_nano), +"severityNumber": 9, +"severityText": "info", +"body": { +"stringValue": body_value +}, +"attributes": log_attributes +} +] +} +] +} +] } ``` -
    - Finally, transformed events can be sent to ClickStack via OpenTelemetry collector over OTLP. This requires the configuration of a OTLP sink in Vector, which takes events from the `remap_filebeat` transform as input: - ```yaml sinks: - otlp: - type: opentelemetry - inputs: [remap_filebeat] # receives events from a remap transform - see below - protocol: - type: http # Use "grpc" for port 4317 - uri: http://localhost:4318/v1/logs # logs endpoint for the OTel collector - method: post - encoding: - codec: json - framing: - method: newline_delimited - headers: - content-type: application/json - authorization: ${YOUR_INGESTION_API_KEY} +otlp: +type: opentelemetry +inputs: [remap_filebeat] # receives events from a remap transform - see below +protocol: +type: http # Use "grpc" for port 4317 +uri: http://localhost:4318/v1/logs # logs endpoint for the OTel collector +method: post +encoding: +codec: json +framing: +method: newline_delimited +headers: +content-type: application/json +authorization: ${YOUR_INGESTION_API_KEY} ``` - The `YOUR_INGESTION_API_KEY` here is produced by ClickStack. You can find the key in the HyperDX app under `Team Settings → API Keys`. - Ingestion keys - Our final complete configuration is shown below: - ```yaml sources: - beats: - type: logstash - address: 0.0.0.0:5044 - tls: - enabled: false # Set to true if you're using TLS - #crt_file: /data/elasticsearch-9.0.1/logstash/logstash.crt - #key_file: /data/elasticsearch-9.0.1/logstash/logstash.key - #ca_file: /data/elasticsearch-9.0.1/ca/ca.crt - #verify_certificate: true - - +beats: +type: logstash +address: 0.0.0.0:5044 +tls: +enabled: false # Set to true if you're using TLS +#crt_file: /data/elasticsearch-9.0.1/logstash/logstash.crt +#key_file: /data/elasticsearch-9.0.1/logstash/logstash.key +#ca_file: /data/elasticsearch-9.0.1/ca/ca.crt +#verify_certificate: true transforms: - remap_filebeat: - inputs: ["beats"] - type: "remap" - file: 'beat_to_otel.vrl' - +remap_filebeat: +inputs: ["beats"] +type: "remap" +file: 'beat_to_otel.vrl' sinks: - otlp: - type: opentelemetry - inputs: [remap_filebeat] - protocol: - type: http # Use "grpc" for port 4317 - uri: http://localhost:4318/v1/logs - method: post - encoding: - codec: json - framing: - method: newline_delimited - headers: - content-type: application/json +otlp: +type: opentelemetry +inputs: [remap_filebeat] +protocol: +type: http # Use "grpc" for port 4317 +uri: http://localhost:4318/v1/logs +method: post +encoding: +codec: json +framing: +method: newline_delimited +headers: +content-type: application/json ``` - ### Configure Filebeat {#configure-filebeat} - Existing Filebeat installations simply need to be modified to send their events to Vector. This requires the configuration of a Logstash output - again, TLS can be optionally configured: - ```yaml # ------------------------------ Logstash Output ------------------------------- output.logstash: - # The Logstash hosts - hosts: ["localhost:5044"] - - # Optional SSL. By default is off. - # List of root certificates for HTTPS server verifications - #ssl.certificate_authorities: ["/etc/pki/root/ca.pem"] - - # Certificate for SSL client authentication - #ssl.certificate: "/etc/pki/client/cert.pem" - - # Client Certificate Key - #ssl.key: "/etc/pki/client/cert.key" +# The Logstash hosts +hosts: ["localhost:5044"] +# Optional SSL. By default is off. 
+# List of root certificates for HTTPS server verifications +#ssl.certificate_authorities: ["/etc/pki/root/ca.pem"] +# Certificate for SSL client authentication +#ssl.certificate: "/etc/pki/client/cert.pem" +# Client Certificate Key +#ssl.key: "/etc/pki/client/cert.key" ``` -
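Once Filebeat is forwarding to Vector and Vector is writing to the ClickStack OpenTelemetry collector, it is worth confirming that events are actually landing in ClickHouse before relying on the new pipeline. The following is a minimal sanity check, assuming the default ClickStack collector schema (an `otel_logs` table with a `TimestampTime` column, as used elsewhere in these docs) - adjust the database, table, and column names if you have customized the schema:

```sql
-- Count events received in the last 10 minutes and show the time range covered,
-- assuming the default ClickStack `otel_logs` table.
SELECT
    count() AS recent_events,
    min(TimestampTime) AS earliest,
    max(TimestampTime) AS latest
FROM otel_logs
WHERE TimestampTime > now() - INTERVAL 10 MINUTE;
```

If the count remains zero, inspect the Vector logs for the `beats` source and the OTLP sink before debugging Filebeat itself.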
    - ## Migrating from Elastic Agent {#migrating-from-elastic-agent} The Elastic Agent consolidates the different Elastic Beats into a single package. This agent integrates with [Elastic Fleet](https://www.elastic.co/docs/reference/fleet/fleet-server), allowing it to be centrally orchestrated and configured. @@ -323,46 +281,34 @@ Users with Elastic Agents deployed have several migration paths: - Configure the agent to send to a Vector endpoint over the Lumberjack protocol. **This has currently been tested for users collecting log data with the Elastic Agent only.** This can be centrally configured via the Fleet UI in Kibana. - [Run the agent as Elastic OpenTelemetry Collector (EDOT)](https://www.elastic.co/docs/reference/fleet/otel-agent). The Elastic Agent includes an embedded EDOT Collector that allows you to instrument your applications and infrastructure once and send data to multiple vendors and backends. In this configuration, users can simply configure the EDOT collector to forward events to the ClickStack OTel collector over OTLP. **This approach supports all event types.** -We demonstrate both of these options below. + We demonstrate both of these options below. ### Sending data via Vector {#sending-data-via-vector} - #### Install and configure Vector {#install-configure-vector} - Install and configure Vector using the [same steps](#install-vector) as those documented for migrating from Filebeat. - #### Configure Elastic Agent {#configure-elastic-agent} - Elastic Agent needs to be configured to send data via the Logstash protocol Lumberjack. This is a [supported deployment pattern](https://www.elastic.co/docs/manage-data/ingest/ingest-reference-architectures/ls-networkbridge) and can either be configured centrally or [via the agent configuration file `elastic-agent.yaml`](https://www.elastic.co/docs/reference/fleet/logstash-output) if deploying without Fleet. - Central configuration through Kibana can be achieved by adding [an Output to Fleet](https://www.elastic.co/docs/reference/fleet/fleet-settings#output-settings). - Add Logstash output - This output can then be used in an [agent policy](https://www.elastic.co/docs/reference/fleet/agent-policy). This will automatically mean any agents using the policy will send their data to Vector. - Agent settings - Since this requires secure communication over TLS to be configured, we recommend the guide ["Configure SSL/TLS for the Logstash output"](https://www.elastic.co/docs/reference/fleet/secure-logstash-connections#use-ls-output), which can be followed with the user assuming their Vector instance assumes the role of Logstash. - Note that this requires users to configure the Logstash source in Vector to also mutual TLS. Use the keys and certificates [generated in the guide](https://www.elastic.co/docs/reference/fleet/secure-logstash-connections#generate-logstash-certs) to configure the input appropriately. - ```yaml sources: - beats: - type: logstash - address: 0.0.0.0:5044 - tls: - enabled: true # Set to true if you're using TLS. - # The files below are generated from the steps at https://www.elastic.co/docs/reference/fleet/secure-logstash-connections#generate-logstash-certs - crt_file: logstash.crt - key_file: logstash.key - ca_file: ca.crt - verify_certificate: true +beats: +type: logstash +address: 0.0.0.0:5044 +tls: +enabled: true # Set to true if you're using TLS. 
+# The files below are generated from the steps at https://www.elastic.co/docs/reference/fleet/secure-logstash-connections#generate-logstash-certs +crt_file: logstash.crt +key_file: logstash.key +ca_file: ca.crt +verify_certificate: true ``` - ### Run Elastic Agent as OpenTelemetry collector {#run-agent-as-otel} @@ -375,7 +321,6 @@ Users running the EDOT collector distributed with Elastic Agent will not be able To run the Elastic Agent with the EDOT collector, see the [official Elastic guide](https://www.elastic.co/docs/reference/fleet/otel-agent-transform). Rather than configuring the Elastic endpoint, as indicated in the guide, remove existing `exporters` and configure the OTLP output - sending data to the ClickStack OpenTelemetry collector. For example, the configuration for the exporters becomes: - ```yaml exporters: # Exporter to send logs and metrics to Elasticsearch Managed OTLP Input diff --git a/docs/use-cases/observability/clickstack/migration/elastic/migrating-data.md b/docs/use-cases/observability/clickstack/migration/elastic/migrating-data.md index 213b3a37f54..533bb27c9a6 100644 --- a/docs/use-cases/observability/clickstack/migration/elastic/migrating-data.md +++ b/docs/use-cases/observability/clickstack/migration/elastic/migrating-data.md @@ -17,32 +17,27 @@ When migrating from Elastic to ClickStack for observability use cases, we recomm 1. **Minimal risk**: by running both systems concurrently, you maintain access to existing data and dashboards while validating ClickStack and familiarizing your users with the new system. 2. **Natural data expiration**: most observability data has a limited retention period (typically 30 days or less), allowing for a natural transition as data expires from Elastic. 3. **Simplified migration**: no need for complex data transfer tools or processes to move historical data between systems. -
    -:::note Migrating data -We demonstrate an approach for migrating essential data from Elasticsearch to ClickHouse in the section ["Migrating data"](#migrating-data). This should not be used for larger datasets as it is rarely performant - limited by the ability for Elasticsearch to export efficiently, with only JSON format supported. -::: + :::note Migrating data + We demonstrate an approach for migrating essential data from Elasticsearch to ClickHouse in the section ["Migrating data"](#migrating-data). This should not be used for larger datasets as it is rarely performant - limited by Elasticsearch's ability to export data efficiently, with only the JSON format supported. + ::: ### Implementation steps {#implementation-steps} 1. **Configure Dual Ingestion** -
    -Set up your data collection pipeline to send data to both Elastic and ClickStack simultaneously. + Set up your data collection pipeline to send data to both Elastic and ClickStack simultaneously. -How this is achieved depends on your current agents for collection - see ["Migrating Agents"](/use-cases/observability/clickstack/migration/elastic/migrating-agents). + How this is achieved depends on your current agents for collection - see ["Migrating Agents"](/use-cases/observability/clickstack/migration/elastic/migrating-agents). 2. **Adjust Retention Periods** -
    -Configure Elastic's TTL settings to match your desired retention period. Set up the ClickStack [TTL](/use-cases/observability/clickstack/production#configure-ttl) to maintain data for the same duration. + Configure Elastic's TTL settings to match your desired retention period. Set up the ClickStack [TTL](/use-cases/observability/clickstack/production#configure-ttl) to maintain data for the same duration. 3. **Validate and Compare**: -
    - Run queries against both systems to ensure data consistency - Compare query performance and results - Migrate dashboards and alerts to ClickStack. This is currently a manual process. - Verify that all critical dashboards and alerts work as expected in ClickStack 4. **Gradual Transition**: -
    - As data naturally expires from Elastic, users will increasingly rely on ClickStack - Once confidence in ClickStack is established, you can begin redirecting queries and dashboards @@ -72,11 +67,11 @@ We recommend starting with a **single shard** and scaling vertically. This confi - **[ClickHouse Cloud](https://clickhouse.com/cloud)**: Uses a single-shard, multi-replica architecture by default. Storage and compute scale independently, making it ideal for observability use cases with unpredictable ingest patterns and read-heavy workloads. - **ClickHouse OSS**: In self-managed deployments, we recommend: - - Starting with a single shard - - Scaling vertically with additional CPU and RAM - - Using [tiered storage](/observability/managing-data#storage-tiers) to extend local disk with S3-compatible object storage - - Using [`ReplicatedMergeTree`](/engines/table-engines/mergetree-family/replication) if high availability is required - - For fault tolerance, [1 replica of your shard](/engines/table-engines/mergetree-family/replication) is typically sufficient in Observability workloads. + - Starting with a single shard + - Scaling vertically with additional CPU and RAM + - Using [tiered storage](/observability/managing-data#storage-tiers) to extend local disk with S3-compatible object storage + - Using [`ReplicatedMergeTree`](/engines/table-engines/mergetree-family/replication) if high availability is required + - For fault tolerance, [1 replica of your shard](/engines/table-engines/mergetree-family/replication) is typically sufficient in Observability workloads. ### When to shard {#when-to-shard} @@ -86,7 +81,7 @@ Sharding may be necessary if: - You need tenant isolation or regional data separation - Your total dataset is too large for a single server, even with object storage -If you do need to shard, refer to [Horizontal scaling](/architecture/horizontal-scaling) for guidance on shard keys and distributed table setup. + If you do need to shard, refer to [Horizontal scaling](/architecture/horizontal-scaling) for guidance on shard keys and distributed table setup. ### Retention and TTL {#retention-and-ttl} @@ -96,7 +91,7 @@ ClickHouse uses [TTL clauses](/use-cases/observability/clickstack/production#con - Move older data to cold object storage - Retain only recent, frequently queried logs on fast disk -We recommend aligning your ClickHouse TTL configuration with your existing Elastic retention policies to maintain a consistent data lifecycle during the migration. For examples, see [ClickStack production TTL setup](/use-cases/observability/clickstack/production#configure-ttl). + We recommend aligning your ClickHouse TTL configuration with your existing Elastic retention policies to maintain a consistent data lifecycle during the migration. For examples, see [ClickStack production TTL setup](/use-cases/observability/clickstack/production#configure-ttl). ## Migrating data {#migrating-data} @@ -106,475 +101,442 @@ While we recommend parallel operation for most observability data, there are spe - Business data stored in Elasticsearch that needs to be correlated with observability data, with ClickHouse's SQL capabilities and Business Intelligence integrations making it easier to maintain and query the data compared to Elasticsearch's more limited query options. 
- Configuration data that needs to be preserved across the migration -This approach is only viable for datasets under 10 million rows, as Elasticsearch's export capabilities are limited to JSON over HTTP and don't scale well for larger datasets. + This approach is only viable for datasets under 10 million rows, as Elasticsearch's export capabilities are limited to JSON over HTTP and don't scale well for larger datasets. -The following steps allow the migration of a single Elasticsearch index from ClickHouse. + The following steps allow the migration of a single Elasticsearch index to ClickHouse. - ### Migrate schema {#migrate-scheme} - Create a table in ClickHouse for the index being migrated from Elasticsearch. Users can map [Elasticsearch types to their ClickHouse](/use-cases/observability/clickstack/migration/elastic/types) equivalent. Alternatively, users can simply rely on the JSON data type in ClickHouse, which will dynamically create columns of the appropriate type as data is inserted. - Consider the following Elasticsearch mapping for an index containing `syslog` data: -
    Elasticsearch mapping - ```javascripton GET .ds-logs-system.syslog-default-2025.06.03-000001/_mapping { - ".ds-logs-system.syslog-default-2025.06.03-000001": { - "mappings": { - "_meta": { - "managed_by": "fleet", - "managed": true, - "package": { - "name": "system" - } - }, - "_data_stream_timestamp": { - "enabled": true - }, - "dynamic_templates": [], - "date_detection": false, - "properties": { - "@timestamp": { - "type": "date", - "ignore_malformed": false - }, - "agent": { - "properties": { - "ephemeral_id": { - "type": "keyword", - "ignore_above": 1024 - }, - "id": { - "type": "keyword", - "ignore_above": 1024 - }, - "name": { - "type": "keyword", - "fields": { - "text": { - "type": "match_only_text" - } - } - }, - "type": { - "type": "keyword", - "ignore_above": 1024 - }, - "version": { - "type": "keyword", - "ignore_above": 1024 - } - } - }, - "cloud": { - "properties": { - "account": { - "properties": { - "id": { - "type": "keyword", - "ignore_above": 1024 - } - } - }, - "availability_zone": { - "type": "keyword", - "ignore_above": 1024 - }, - "image": { - "properties": { - "id": { - "type": "keyword", - "ignore_above": 1024 - } - } - }, - "instance": { - "properties": { - "id": { - "type": "keyword", - "ignore_above": 1024 - } - } - }, - "machine": { - "properties": { - "type": { - "type": "keyword", - "ignore_above": 1024 - } - } - }, - "provider": { - "type": "keyword", - "ignore_above": 1024 - }, - "region": { - "type": "keyword", - "ignore_above": 1024 - }, - "service": { - "properties": { - "name": { - "type": "keyword", - "fields": { - "text": { - "type": "match_only_text" - } - } - } - } - } - } - }, - "data_stream": { - "properties": { - "dataset": { - "type": "constant_keyword", - "value": "system.syslog" - }, - "namespace": { - "type": "constant_keyword", - "value": "default" - }, - "type": { - "type": "constant_keyword", - "value": "logs" - } - } - }, - "ecs": { - "properties": { - "version": { - "type": "keyword", - "ignore_above": 1024 - } - } - }, - "elastic_agent": { - "properties": { - "id": { - "type": "keyword", - "ignore_above": 1024 - }, - "snapshot": { - "type": "boolean" - }, - "version": { - "type": "keyword", - "ignore_above": 1024 - } - } - }, - "event": { - "properties": { - "agent_id_status": { - "type": "keyword", - "ignore_above": 1024 - }, - "dataset": { - "type": "constant_keyword", - "value": "system.syslog" - }, - "ingested": { - "type": "date", - "format": "strict_date_time_no_millis||strict_date_optional_time||epoch_millis", - "ignore_malformed": false - }, - "module": { - "type": "constant_keyword", - "value": "system" - }, - "timezone": { - "type": "keyword", - "ignore_above": 1024 - } - } - }, - "host": { - "properties": { - "architecture": { - "type": "keyword", - "ignore_above": 1024 - }, - "containerized": { - "type": "boolean" - }, - "hostname": { - "type": "keyword", - "ignore_above": 1024 - }, - "id": { - "type": "keyword", - "ignore_above": 1024 - }, - "ip": { - "type": "ip" - }, - "mac": { - "type": "keyword", - "ignore_above": 1024 - }, - "name": { - "type": "keyword", - "ignore_above": 1024 - }, - "os": { - "properties": { - "build": { - "type": "keyword", - "ignore_above": 1024 - }, - "codename": { - "type": "keyword", - "ignore_above": 1024 - }, - "family": { - "type": "keyword", - "ignore_above": 1024 - }, - "kernel": { - "type": "keyword", - "ignore_above": 1024 - }, - "name": { - "type": "keyword", - "fields": { - "text": { - "type": "match_only_text" - } - } - }, - "platform": { - "type": "keyword", - "ignore_above": 1024 
- }, - "type": { - "type": "keyword", - "ignore_above": 1024 - }, - "version": { - "type": "keyword", - "ignore_above": 1024 - } - } - } - } - }, - "input": { - "properties": { - "type": { - "type": "keyword", - "ignore_above": 1024 - } - } - }, - "log": { - "properties": { - "file": { - "properties": { - "path": { - "type": "keyword", - "fields": { - "text": { - "type": "match_only_text" - } - } - } - } - }, - "offset": { - "type": "long" - } - } - }, - "message": { - "type": "match_only_text" - }, - "process": { - "properties": { - "name": { - "type": "keyword", - "fields": { - "text": { - "type": "match_only_text" - } - } - }, - "pid": { - "type": "long" - } - } - }, - "system": { - "properties": { - "syslog": { - "type": "object" - } - } - } - } - } - } +".ds-logs-system.syslog-default-2025.06.03-000001": { +"mappings": { +"_meta": { +"managed_by": "fleet", +"managed": true, +"package": { +"name": "system" +} +}, +"_data_stream_timestamp": { +"enabled": true +}, +"dynamic_templates": [], +"date_detection": false, +"properties": { +"@timestamp": { +"type": "date", +"ignore_malformed": false +}, +"agent": { +"properties": { +"ephemeral_id": { +"type": "keyword", +"ignore_above": 1024 +}, +"id": { +"type": "keyword", +"ignore_above": 1024 +}, +"name": { +"type": "keyword", +"fields": { +"text": { +"type": "match_only_text" +} +} +}, +"type": { +"type": "keyword", +"ignore_above": 1024 +}, +"version": { +"type": "keyword", +"ignore_above": 1024 +} +} +}, +"cloud": { +"properties": { +"account": { +"properties": { +"id": { +"type": "keyword", +"ignore_above": 1024 +} +} +}, +"availability_zone": { +"type": "keyword", +"ignore_above": 1024 +}, +"image": { +"properties": { +"id": { +"type": "keyword", +"ignore_above": 1024 +} +} +}, +"instance": { +"properties": { +"id": { +"type": "keyword", +"ignore_above": 1024 +} +} +}, +"machine": { +"properties": { +"type": { +"type": "keyword", +"ignore_above": 1024 +} +} +}, +"provider": { +"type": "keyword", +"ignore_above": 1024 +}, +"region": { +"type": "keyword", +"ignore_above": 1024 +}, +"service": { +"properties": { +"name": { +"type": "keyword", +"fields": { +"text": { +"type": "match_only_text" +} +} +} +} +} +} +}, +"data_stream": { +"properties": { +"dataset": { +"type": "constant_keyword", +"value": "system.syslog" +}, +"namespace": { +"type": "constant_keyword", +"value": "default" +}, +"type": { +"type": "constant_keyword", +"value": "logs" +} +} +}, +"ecs": { +"properties": { +"version": { +"type": "keyword", +"ignore_above": 1024 +} +} +}, +"elastic_agent": { +"properties": { +"id": { +"type": "keyword", +"ignore_above": 1024 +}, +"snapshot": { +"type": "boolean" +}, +"version": { +"type": "keyword", +"ignore_above": 1024 +} +} +}, +"event": { +"properties": { +"agent_id_status": { +"type": "keyword", +"ignore_above": 1024 +}, +"dataset": { +"type": "constant_keyword", +"value": "system.syslog" +}, +"ingested": { +"type": "date", +"format": "strict_date_time_no_millis||strict_date_optional_time||epoch_millis", +"ignore_malformed": false +}, +"module": { +"type": "constant_keyword", +"value": "system" +}, +"timezone": { +"type": "keyword", +"ignore_above": 1024 +} +} +}, +"host": { +"properties": { +"architecture": { +"type": "keyword", +"ignore_above": 1024 +}, +"containerized": { +"type": "boolean" +}, +"hostname": { +"type": "keyword", +"ignore_above": 1024 +}, +"id": { +"type": "keyword", +"ignore_above": 1024 +}, +"ip": { +"type": "ip" +}, +"mac": { +"type": "keyword", +"ignore_above": 1024 +}, +"name": { +"type": "keyword", 
+"ignore_above": 1024 +}, +"os": { +"properties": { +"build": { +"type": "keyword", +"ignore_above": 1024 +}, +"codename": { +"type": "keyword", +"ignore_above": 1024 +}, +"family": { +"type": "keyword", +"ignore_above": 1024 +}, +"kernel": { +"type": "keyword", +"ignore_above": 1024 +}, +"name": { +"type": "keyword", +"fields": { +"text": { +"type": "match_only_text" +} +} +}, +"platform": { +"type": "keyword", +"ignore_above": 1024 +}, +"type": { +"type": "keyword", +"ignore_above": 1024 +}, +"version": { +"type": "keyword", +"ignore_above": 1024 +} +} +} +} +}, +"input": { +"properties": { +"type": { +"type": "keyword", +"ignore_above": 1024 +} +} +}, +"log": { +"properties": { +"file": { +"properties": { +"path": { +"type": "keyword", +"fields": { +"text": { +"type": "match_only_text" +} +} +} +} +}, +"offset": { +"type": "long" +} +} +}, +"message": { +"type": "match_only_text" +}, +"process": { +"properties": { +"name": { +"type": "keyword", +"fields": { +"text": { +"type": "match_only_text" +} +} +}, +"pid": { +"type": "long" +} +} +}, +"system": { +"properties": { +"syslog": { +"type": "object" +} +} +} +} +} +} } ```
    - - The equivalent ClickHouse table schema: -
    ClickHouse schema - ```sql SET enable_json_type = 1; - CREATE TABLE logs_system_syslog ( - `@timestamp` DateTime, - `agent` Tuple( - ephemeral_id String, - id String, - name String, - type String, - version String), - `cloud` Tuple( - account Tuple( - id String), - availability_zone String, - image Tuple( - id String), - instance Tuple( - id String), - machine Tuple( - type String), - provider String, - region String, - service Tuple( - name String)), - `data_stream` Tuple( - dataset String, - namespace String, - type String), - `ecs` Tuple( - version String), - `elastic_agent` Tuple( - id String, - snapshot UInt8, - version String), - `event` Tuple( - agent_id_status String, - dataset String, - ingested DateTime, - module String, - timezone String), - `host` Tuple( - architecture String, - containerized UInt8, - hostname String, - id String, - ip Array(Variant(IPv4, IPv6)), - mac Array(String), - name String, - os Tuple( - build String, - codename String, - family String, - kernel String, - name String, - platform String, - type String, - version String)), - `input` Tuple( - type String), - `log` Tuple( - file Tuple( - path String), - offset Int64), - `message` String, - `process` Tuple( - name String, - pid Int64), - `system` Tuple( - syslog JSON) +`@timestamp` DateTime, +`agent` Tuple( +ephemeral_id String, +id String, +name String, +type String, +version String), +`cloud` Tuple( +account Tuple( +id String), +availability_zone String, +image Tuple( +id String), +instance Tuple( +id String), +machine Tuple( +type String), +provider String, +region String, +service Tuple( +name String)), +`data_stream` Tuple( +dataset String, +namespace String, +type String), +`ecs` Tuple( +version String), +`elastic_agent` Tuple( +id String, +snapshot UInt8, +version String), +`event` Tuple( +agent_id_status String, +dataset String, +ingested DateTime, +module String, +timezone String), +`host` Tuple( +architecture String, +containerized UInt8, +hostname String, +id String, +ip Array(Variant(IPv4, IPv6)), +mac Array(String), +name String, +os Tuple( +build String, +codename String, +family String, +kernel String, +name String, +platform String, +type String, +version String)), +`input` Tuple( +type String), +`log` Tuple( +file Tuple( +path String), +offset Int64), +`message` String, +`process` Tuple( +name String, +pid Int64), +`system` Tuple( +syslog JSON) ) ENGINE = MergeTree ORDER BY (`host.name`, `@timestamp`) ``` -
    - Note that: - - - Tuples are used to represent nested structures instead of dot notation - - Used appropriate ClickHouse types based on the mapping: - - `keyword` → `String` - - `date` → `DateTime` - - `boolean` → `UInt8` - - `long` → `Int64` - - `ip` → `Array(Variant(IPv4, IPv6))`. We use a [`Variant(IPv4, IPv6)`](/sql-reference/data-types/variant) here as the field contains a mixture of [`IPv4`](/sql-reference/data-types/ipv4) and [`IPv6`](/sql-reference/data-types/ipv6). - - `object` → `JSON` for the syslog object whose structure is unpredictable. - - Columns `host.ip` and `host.mac` are explicit `Array` type, unlike in Elasticsearch where all types are arrays. - - An `ORDER BY` clause is added using timestamp and hostname for efficient time-based queries - - `MergeTree`, which is optimal for log data, is used as the engine type - +- Tuples are used to represent nested structures instead of dot notation +- Used appropriate ClickHouse types based on the mapping: +- `keyword` → `String` +- `date` → `DateTime` +- `boolean` → `UInt8` +- `long` → `Int64` +- `ip` → `Array(Variant(IPv4, IPv6))`. We use a [`Variant(IPv4, IPv6)`](/sql-reference/data-types/variant) here as the field contains a mixture of [`IPv4`](/sql-reference/data-types/ipv4) and [`IPv6`](/sql-reference/data-types/ipv6). +- `object` → `JSON` for the syslog object whose structure is unpredictable. +- Columns `host.ip` and `host.mac` are explicit `Array` type, unlike in Elasticsearch where all types are arrays. +- An `ORDER BY` clause is added using timestamp and hostname for efficient time-based queries +- `MergeTree`, which is optimal for log data, is used as the engine type **This approach of statically defining the schema and using the JSON type selectively where required [is recommended](/integrations/data-formats/json/schema#handling-semi-structured-dynamic-structures).** - This strict schema has a number of benefits: - -- **Data validation** – enforcing a strict schema avoids the risk of column explosion, outside of specific structures. +- **Data validation** – enforcing a strict schema avoids the risk of column explosion, outside of specific structures. - **Avoids risk of column explosion**: although the JSON type scales to potentially thousands of columns, where subcolumns are stored as dedicated columns, this can lead to a column file explosion where an excessive number of column files are created that impacts performance. To mitigate this, the underlying [Dynamic type](/sql-reference/data-types/dynamic) used by JSON offers a [`max_dynamic_paths`](/sql-reference/data-types/newjson#reading-json-paths-as-sub-columns) parameter, which limits the number of unique paths stored as separate column files. Once the threshold is reached, additional paths are stored in a shared column file using a compact encoded format, maintaining performance and storage efficiency while supporting flexible data ingestion. Accessing this shared column file is, however, not as performant. Note, however, that the JSON column can be used with [type hints](/integrations/data-formats/json/schema#using-type-hints-and-skipping-paths). "Hinted" columns will deliver the same performance as dedicated columns. - **Simpler introspection of paths and types**: although the JSON type supports [introspection functions](/sql-reference/data-types/newjson#introspection-functions) to determine the types and paths that have been inferred, static structures can be simpler to explore e.g. with `DESCRIBE`. -
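For example, with the statically typed `logs_system_syslog` table defined above, both the schema and its nested fields can be explored with plain SQL. This is only a quick sketch against the table created earlier:

```sql
-- The static schema can be inspected directly, with no JSON introspection functions required.
DESCRIBE TABLE logs_system_syslog;

-- Nested Tuple fields are addressed with dot notation, e.g. the ordering key columns.
SELECT `host.name`, count() AS events
FROM logs_system_syslog
GROUP BY `host.name`
ORDER BY events DESC
LIMIT 5;
```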
    Alternatively, users can simply create a table with one `JSON` column. - ```sql SET enable_json_type = 1; - CREATE TABLE syslog_json ( - `json` JSON(`host.name` String, `@timestamp` DateTime) +`json` JSON(`host.name` String, `@timestamp` DateTime) ) ENGINE = MergeTree ORDER BY (`json.host.name`, `json.@timestamp`) ``` - :::note We provide a type hint for the `host.name` and `timestamp` columns in the JSON definition as we use it in the ordering/primary key. This helps ClickHouse know this column won't be null and ensures it knows which sub-columns to use (there may be multiple for each type, so this is ambiguous otherwise). ::: - This latter approach, while simpler, is best for prototyping and data engineering tasks. For production, use `JSON` only for dynamic sub structures where necessary. - For more details on using the JSON type in schemas, and how to efficiently apply it, we recommend the guide ["Designing your schema"](/integrations/data-formats/json/schema). - ### Install `elasticdump` {#install-elasticdump} - We recommend [`elasticdump`](https://github.com/elasticsearch-dump/elasticsearch-dump) for exporting data from Elasticsearch. This tool requires `node` and should be installed on a machine with network proximity to both Elasticsearch and ClickHouse. We recommend a dedicated server with at least 4 cores and 16GB of RAM for most exports. - ```shell npm install elasticdump -g ``` - `elasticdump` offers several advantages for data migration: - - It interacts directly with the Elasticsearch REST API, ensuring proper data export. - Maintains data consistency during the export process using the Point-in-Time (PIT) API - this creates a consistent snapshot of the data at a specific moment. - Exports data directly to JSON format, which can be streamed to the ClickHouse client for insertion. - Where possible, we recommend running both ClickHouse, Elasticsearch, and `elastic dump` in the same availability zone or data center to minimize network egress and maximize throughput. - ### Install ClickHouse client {#install-clickhouse-client} - Ensure ClickHouse is [installed on the server](/install) on which `elasticdump` is located. **Do not start a ClickHouse server** - these steps only require the client. - ### Stream data {#stream-data} - To stream data between Elasticsearch and ClickHouse, use the `elasticdump` command - piping the output directly to the ClickHouse client. The following inserts the data into our well structured table `logs_system_syslog`. - ```shell # export url and credentials export ELASTICSEARCH_INDEX=.ds-logs-system.syslog-default-2025.06.03-000001 @@ -584,69 +546,51 @@ export ELASTICDUMP_INPUT_PASSWORD= export CLICKHOUSE_HOST= export CLICKHOUSE_PASSWORD= export CLICKHOUSE_USER=default - # command to run - modify as required -elasticdump --input=${ELASTICSEARCH_URL} --type=data --input-index ${ELASTICSEARCH_INDEX} --output=$ --sourceOnly --searchAfter --pit=true | +elasticdump --input=${ELASTICSEARCH_URL} --type=data --input-index ${ELASTICSEARCH_INDEX} --output=$ --sourceOnly --searchAfter --pit=true | clickhouse-client --host ${CLICKHOUSE_HOST} --secure --password ${CLICKHOUSE_PASSWORD} --user ${CLICKHOUSE_USER} --max_insert_block_size=1000 \ --min_insert_block_size_bytes=0 --min_insert_block_size_rows=1000 --query="INSERT INTO test.logs_system_syslog FORMAT JSONEachRow" ``` - Note the use of the following flags for `elasticdump`: - - `type=data` - limits the response to only the document content in Elasticsearch. - `input-index` - our Elasticsearch input index. 
- `output=$` - redirects all results to stdout. - `sourceOnly` flag ensuring we omit metadata fields in our response. - `searchAfter` flag to use the [`searchAfter` API](https://www.elastic.co/docs/reference/elasticsearch/rest-apis/paginate-search-results#search-after) for efficient pagination of results. - `pit=true` to ensure consistent results between queries using the [point in time API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-open-point-in-time). -
    Our ClickHouse client parameters here (aside from credentials): - - `max_insert_block_size=1000` - ClickHouse client will send data once this number of rows is reached. Increasing improves throughput at the expense of time to formulate a block - thus increasing time till data appears in ClickHouse. - `min_insert_block_size_bytes=0` - Turns off server block squashing by bytes. - `min_insert_block_size_rows=1000` - Squashes blocks from clients on the server side. In this case, we set this to match `max_insert_block_size` so rows appear immediately. Increase to improve throughput. - `query="INSERT INTO logs_system_syslog FORMAT JSONEachRow"` - Inserting the data as [JSONEachRow format](/integrations/data-formats/json/other-formats). This is appropriate if sending to a well-defined schema such as `logs_system_syslog`. -
    **Users can expect throughput in order of thousands of rows per second.** - :::note Inserting into single JSON row If inserting into a single JSON column (see the `syslog_json` schema above), the same insert command can be used. However, users must specify `JSONAsObject` as the format instead of `JSONEachRow` e.g. - ```shell -elasticdump --input=${ELASTICSEARCH_URL} --type=data --input-index ${ELASTICSEARCH_INDEX} --output=$ --sourceOnly --searchAfter --pit=true | +elasticdump --input=${ELASTICSEARCH_URL} --type=data --input-index ${ELASTICSEARCH_INDEX} --output=$ --sourceOnly --searchAfter --pit=true | clickhouse-client --host ${CLICKHOUSE_HOST} --secure --password ${CLICKHOUSE_PASSWORD} --user ${CLICKHOUSE_USER} --max_insert_block_size=1000 \ --min_insert_block_size_bytes=0 --min_insert_block_size_rows=1000 --query="INSERT INTO test.logs_system_syslog FORMAT JSONAsObject" ``` - See ["Reading JSON as an object"](/integrations/data-formats/json/other-formats#reading-json-as-an-object) for further details. ::: - ### Transform data (optional) {#transform-data} - The above commands assume a 1:1 mapping of Elasticsearch fields to ClickHouse columns. Users often need to filter and transform Elasticsearch data before insertion into ClickHouse. - This can be achieved using the [`input`](/sql-reference/table-functions/input) table function, which allows us to execute any `SELECT` query on the stdout. - Suppose we wish to only store the `timestamp` and `hostname` fields from our earlier data. The ClickHouse schema: - ```sql CREATE TABLE logs_system_syslog_v2 ( - `timestamp` DateTime, - `hostname` String +`timestamp` DateTime, +`hostname` String ) ENGINE = MergeTree ORDER BY (hostname, timestamp) ``` - To insert from `elasticdump` into this table, we can simply use the `input` table function - using the JSON type to dynamically detect and select the required columns. Note this `SELECT` query could easily contain a filter. - ```shell elasticdump --input=${ELASTICSEARCH_URL} --type=data --input-index ${ELASTICSEARCH_INDEX} --output=$ --sourceOnly --searchAfter --pit=true | clickhouse-client --host ${CLICKHOUSE_HOST} --secure --password ${CLICKHOUSE_PASSWORD} --user ${CLICKHOUSE_USER} --max_insert_block_size=1000 \ --min_insert_block_size_bytes=0 --min_insert_block_size_rows=1000 --query="INSERT INTO test.logs_system_syslog_v2 SELECT json.\`@timestamp\` as timestamp, json.host.hostname as hostname FROM input('json JSON') FORMAT JSONAsObject" ``` - Note the need to escape the `@timestamp` field name and use the `JSONAsObject` input format. -
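After the insert completes, a quick aggregation confirms that the filtered migration behaved as expected. This sketch assumes the `logs_system_syslog_v2` table created above (add the `test.` database prefix if you created it there, as in the earlier commands):

```sql
-- Verify row counts and time coverage of the migrated subset, per host.
SELECT
    hostname,
    count() AS row_count,
    min(timestamp) AS earliest,
    max(timestamp) AS latest
FROM logs_system_syslog_v2
GROUP BY hostname
ORDER BY row_count DESC
LIMIT 10;
```

Comparing `row_count` totals against the document count of the source index (for example via Elasticsearch's `_count` API) is a simple way to confirm the export completed.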
    diff --git a/docs/use-cases/observability/clickstack/migration/elastic/migrating-sdks.md b/docs/use-cases/observability/clickstack/migration/elastic/migrating-sdks.md index 9b3e8d5c166..d857874410f 100644 --- a/docs/use-cases/observability/clickstack/migration/elastic/migrating-sdks.md +++ b/docs/use-cases/observability/clickstack/migration/elastic/migrating-sdks.md @@ -19,9 +19,9 @@ The Elastic Stack provides two types of language SDKs for instrumenting applicat 2. **[Elastic Distributions of OpenTelemetry (EDOT SDKs)](https://www.elastic.co/docs/reference/opentelemetry/edot-sdks/)** – These are Elastic's distributions of the standard OpenTelemetry SDKs, available for .NET, Java, Node.js, PHP, and Python. If your application is already using an EDOT SDK, you do not need to re-instrument your code. Instead, you can simply reconfigure the SDK to export telemetry data to the OTLP Collector included in ClickStack. See ["Migrating EDOT SDKs"](#migrating-edot-sdks) for further details. -:::note Use ClickStack SDKs where possible -While standard OpenTelemetry SDKs are supported, we strongly recommend using the [**ClickStack-distributed SDKs**](/use-cases/observability/clickstack/sdks) for each language. These distributions include additional instrumentation, enhanced defaults, and custom extensions designed to work seamlessly with the ClickStack pipeline and HyperDX UI. By using the ClickStack SDKs, you can unlock advanced features such as exception stack traces that are not available with vanilla OpenTelemetry or EDOT SDKs. -::: + :::note Use ClickStack SDKs where possible + While standard OpenTelemetry SDKs are supported, we strongly recommend using the [**ClickStack-distributed SDKs**](/use-cases/observability/clickstack/sdks) for each language. These distributions include additional instrumentation, enhanced defaults, and custom extensions designed to work seamlessly with the ClickStack pipeline and HyperDX UI. By using the ClickStack SDKs, you can unlock advanced features such as exception stack traces that are not available with vanilla OpenTelemetry or EDOT SDKs. + ::: ## Migrating EDOT SDKs {#migrating-edot-sdks} diff --git a/docs/use-cases/observability/clickstack/migration/elastic/search.md b/docs/use-cases/observability/clickstack/migration/elastic/search.md index 9a399f629c0..4d1091f3917 100644 --- a/docs/use-cases/observability/clickstack/migration/elastic/search.md +++ b/docs/use-cases/observability/clickstack/migration/elastic/search.md @@ -12,10 +12,9 @@ import Image from '@theme/IdealImage'; import hyperdx_search from '@site/static/images/use-cases/observability/hyperdx-search.png'; import hyperdx_sql from '@site/static/images/use-cases/observability/hyperdx-sql.png'; - ## Search in ClickStack and Elastic {#search-in-clickstack-and-elastic} -ClickHouse is a SQL-native engine, designed from the ground up for high-performance analytical workloads. In contrast, Elasticsearch provides a SQL-like interface, transpiling SQL into the underlying Elasticsearch query DSL — meaning it is not a first-class citizen, and [feature parity](https://www.elastic.co/docs/explore-analyze/query-filter/languages/sql-limitations) is limited. +ClickHouse is a SQL-native engine, designed from the ground up for high-performance analytical workloads. 
In contrast, Elasticsearch provides a SQL-like interface, transpiling SQL into the underlying Elasticsearch query DSL — meaning it is not a first-class citizen, and [feature parity](https://www.elastic.co/docs/explore-analyze/query-filter/languages/sql-limitations) is limited. ClickHouse not only supports full SQL but extends it with a range of observability-focused functions, such as [`argMax`](/sql-reference/aggregate-functions/reference/argmax), [`histogram`](/sql-reference/aggregate-functions/parametric-functions#histogram), and [`quantileTiming`](/sql-reference/aggregate-functions/reference/quantiletiming), that simplify querying structured logs, metrics, and traces. @@ -55,7 +54,6 @@ Both HyperDX and Elasticsearch provide flexible query languages to enable intuit | Field wildcard | `service.*:error` | `service.*:error` | Not supported in HyperDX at present. | | Escaped special chars | Escape reserved characters with `\` | Same | Escaping required for reserved symbols. | - ## Exists/missing differences {#empty-value-differences} Unlike Elasticsearch, where a field can be entirely omitted from an event and therefore truly "not exist," ClickHouse requires all columns in a table schema to exist. If a field is not provided in an insert event: @@ -63,10 +61,10 @@ Unlike Elasticsearch, where a field can be entirely omitted from an event and th - For [`Nullable`](/sql-reference/data-types/nullable) fields, it will be set to `NULL`. - For non-nullable fields (the default), it will be populated with a default value (often an empty string, 0, or equivalent). -In ClickStack, we use the latter as [`Nullable`](/sql-reference/data-types/nullable) is [not recommended](/optimize/avoid-nullable-columns). + In ClickStack, we use the latter as [`Nullable`](/sql-reference/data-types/nullable) is [not recommended](/optimize/avoid-nullable-columns). -This behavior means that checking whether a field "exists”" in the Elasticsearch sense is not directly supported. + This behavior means that checking whether a field "exists”" in the Elasticsearch sense is not directly supported. -Instead, users can use `field:*` or `field != ''` to check for the presence of a non-empty value. It is thus not possible to distinguish between truly missing and explicitly empty fields. + Instead, users can use `field:*` or `field != ''` to check for the presence of a non-empty value. It is thus not possible to distinguish between truly missing and explicitly empty fields. -In practice, this difference rarely causes issues for observability use cases, but it's important to keep in mind when translating queries between systems. + In practice, this difference rarely causes issues for observability use cases, but it's important to keep in mind when translating queries between systems. diff --git a/docs/use-cases/observability/clickstack/migration/elastic/types.md b/docs/use-cases/observability/clickstack/migration/elastic/types.md index 43fd32f145d..8e7dc220a24 100644 --- a/docs/use-cases/observability/clickstack/migration/elastic/types.md +++ b/docs/use-cases/observability/clickstack/migration/elastic/types.md @@ -12,7 +12,6 @@ keywords: ['JSON', 'Codecs'] Elasticsearch and ClickHouse support a wide variety of data types, but their underlying storage and query models are fundamentally different. This section maps commonly used Elasticsearch field types to their ClickHouse equivalents, where available, and provides context to help guide migrations. Where no equivalent exists, alternatives or notes are provided in the comments. 
- | **Elasticsearch Type** | **ClickHouse Equivalent** | **Comments** | |-------------------------------|------------------------------|--------------| | `boolean` | [`UInt8`](/sql-reference/data-types/int-uint) or [`Bool`](/sql-reference/data-types/boolean) | ClickHouse supports `Boolean` as an alias for `UInt8` in newer versions. | diff --git a/docs/use-cases/observability/clickstack/overview.md b/docs/use-cases/observability/clickstack/overview.md index 504f525e78b..8c43853c131 100644 --- a/docs/use-cases/observability/clickstack/overview.md +++ b/docs/use-cases/observability/clickstack/overview.md @@ -45,9 +45,9 @@ ClickStack consists of three core components: 2. **OpenTelemetry collector** – a custom-built, preconfigured collector with an opinionated schema for logs, traces, and metrics 3. **ClickHouse** – the high-performance analytical database at the heart of the stack -These components can be deployed independently or together. A browser-hosted version of the HyperDX UI is also available, allowing users to connect to existing ClickHouse deployments without additional infrastructure. + These components can be deployed independently or together. A browser-hosted version of the HyperDX UI is also available, allowing users to connect to existing ClickHouse deployments without additional infrastructure. -To get started, visit the [Getting started guide](/use-cases/observability/clickstack/getting-started) before loading a [sample dataset](/use-cases/observability/clickstack/sample-datasets). You can also explore documentation on [deployment options](/use-cases/observability/clickstack/deployment) and [production best practices](/use-cases/observability/clickstack/production). + To get started, visit the [Getting started guide](/use-cases/observability/clickstack/getting-started) before loading a [sample dataset](/use-cases/observability/clickstack/sample-datasets). You can also explore documentation on [deployment options](/use-cases/observability/clickstack/deployment) and [production best practices](/use-cases/observability/clickstack/production). ## Principles {#clickstack-principles} @@ -83,17 +83,17 @@ ClickStack is fully open source and can be deployed anywhere. The schema is flex ClickStack consists of three core components: -1. **HyperDX UI** - A user-friendly interface built for observability. It supports both Lucene-style and SQL queries, interactive dashboards, alerting, trace exploration, and more—all optimized for ClickHouse as the backend. +1. **HyperDX UI** + A user-friendly interface built for observability. It supports both Lucene-style and SQL queries, interactive dashboards, alerting, trace exploration, and more—all optimized for ClickHouse as the backend. -2. **OpenTelemetry collector** - A custom-built collector configured with an opinionated schema optimized for ClickHouse ingestion. It receives logs, metrics, and traces via OpenTelemetry protocols and writes them directly to ClickHouse using efficient batched inserts. +2. **OpenTelemetry collector** + A custom-built collector configured with an opinionated schema optimized for ClickHouse ingestion. It receives logs, metrics, and traces via OpenTelemetry protocols and writes them directly to ClickHouse using efficient batched inserts. -3. **ClickHouse** - The high-performance analytical database that serves as the central data store for wide events. ClickHouse powers fast search, filtering, and aggregation at scale, leveraging its columnar engine and native support for JSON. +3. 
**ClickHouse** + The high-performance analytical database that serves as the central data store for wide events. ClickHouse powers fast search, filtering, and aggregation at scale, leveraging its columnar engine and native support for JSON. -In addition to these three components, ClickStack uses a **MongoDB instance** to store application state such as dashboards, user accounts, and configuration settings. + In addition to these three components, ClickStack uses a **MongoDB instance** to store application state such as dashboards, user accounts, and configuration settings. -A full architectural diagram and deployment details can be found in the [Architecture section](/use-cases/observability/clickstack/architecture). + A full architectural diagram and deployment details can be found in the [Architecture section](/use-cases/observability/clickstack/architecture). -For users interesting in deploying ClickStack to production, we recommend reading the ["Production"](/use-cases/observability/clickstack/production) guide. + For users interesting in deploying ClickStack to production, we recommend reading the ["Production"](/use-cases/observability/clickstack/production) guide. diff --git a/docs/use-cases/observability/clickstack/production.md b/docs/use-cases/observability/clickstack/production.md index 4fd11eadf10..62f4d7cd1d6 100644 --- a/docs/use-cases/observability/clickstack/production.md +++ b/docs/use-cases/observability/clickstack/production.md @@ -47,25 +47,25 @@ Here's how to add it to your `docker-compose.yml` file for the app service: ports: - ${HYPERDX_API_PORT}:${HYPERDX_API_PORT} - ${HYPERDX_APP_PORT}:${HYPERDX_APP_PORT} - environment: - FRONTEND_URL: ${HYPERDX_APP_URL}:${HYPERDX_APP_PORT} - HYPERDX_API_KEY: ${HYPERDX_API_KEY} - HYPERDX_API_PORT: ${HYPERDX_API_PORT} - HYPERDX_APP_PORT: ${HYPERDX_APP_PORT} - HYPERDX_APP_URL: ${HYPERDX_APP_URL} - HYPERDX_LOG_LEVEL: ${HYPERDX_LOG_LEVEL} - MINER_API_URL: 'http://miner:5123' - MONGO_URI: 'mongodb://db:27017/hyperdx' - NEXT_PUBLIC_SERVER_URL: http://127.0.0.1:${HYPERDX_API_PORT} - OTEL_SERVICE_NAME: 'hdx-oss-api' - USAGE_STATS_ENABLED: ${USAGE_STATS_ENABLED:-true} - EXPRESS_SESSION_SECRET: "super-secure-random-string" - networks: + environment: + FRONTEND_URL: ${HYPERDX_APP_URL}:${HYPERDX_APP_PORT} + HYPERDX_API_KEY: ${HYPERDX_API_KEY} + HYPERDX_API_PORT: ${HYPERDX_API_PORT} + HYPERDX_APP_PORT: ${HYPERDX_APP_PORT} + HYPERDX_APP_URL: ${HYPERDX_APP_URL} + HYPERDX_LOG_LEVEL: ${HYPERDX_LOG_LEVEL} + MINER_API_URL: 'http://miner:5123' + MONGO_URI: 'mongodb://db:27017/hyperdx' + NEXT_PUBLIC_SERVER_URL: http://127.0.0.1:${HYPERDX_API_PORT} + OTEL_SERVICE_NAME: 'hdx-oss-api' + USAGE_STATS_ENABLED: ${USAGE_STATS_ENABLED:-true} + EXPRESS_SESSION_SECRET: "super-secure-random-string" + networks: - internal - depends_on: + depends_on: - ch-server - db1 -``` + ``` You can generate a strong secret using openssl: @@ -98,7 +98,7 @@ The ClickHouse user for HyperDX only needs to be a `readonly` user with access t - `cancel_http_readonly_queries_on_client_close` - `wait_end_of_query` -By default the `default` user in both OSS and ClickHouse Cloud will have these permissions available but we recommend you create a new user with these permissions. + By default the `default` user in both OSS and ClickHouse Cloud will have these permissions available but we recommend you create a new user with these permissions. #### Database and ingestion user {#database-ingestion-user} @@ -121,7 +121,7 @@ ClickHouse OSS provides robust security features out of the box. 
However, these - **Avoid hard coding credentials.** Use [named collections](/operations/named-collections) or IAM roles in ClickHouse Cloud. - **Audit access and queries** using [system logs](/operations/system-tables/query_log) and [session logs](/operations/system-tables/session_log). -See also [external authenticators](/operations/external-authenticators) and [query complexity settings](/operations/settings/query-complexity) for managing users and ensuring query/resource limits. + See also [external authenticators](/operations/external-authenticators) and [query complexity settings](/operations/settings/query-complexity) for managing users and ensuring query/resource limits. ### Configure Time To Live (TTL) {#configure-ttl} @@ -136,66 +136,39 @@ Follow the official [MongoDB security checklist](https://www.mongodb.com/docs/ma The following represents a simple deployment of ClickStack using ClickHouse Cloud which meets best practices. - ### Create a service {#create-a-service} - Follow the [getting started guide for ClickHouse Cloud](/getting-started/quick-start/cloud/#1-create-a-clickhouse-service) to create a service. - ### Copy connection details {#copy-connection-details} - To find the connection details for HyperDX, navigate to the ClickHouse Cloud console and click the Connect button on the sidebar recording the HTTP connection details specifically the url. - **While you may use the default username and password shown in this step to connect HyperDX, we recommend creating a dedicated user - see below** - Connect Cloud - ### Create a HyperDX user {#create-a-user} - We recommend you create a dedicated user for HyperDX. Run the following SQL commands in the [Cloud SQL console](/cloud/get-started/sql-console), providing a secure password which meets complexity requirements: - ```sql CREATE USER hyperdx IDENTIFIED WITH sha256_password BY '' SETTINGS PROFILE 'readonly'; GRANT sql_console_read_only TO hyperdx; ``` - ### Prepare for ingestion user {#prepare-for-ingestion} - Create an `otel` database for data and a `hyperdx_ingest` user for ingestion with limited permissions. - ```sql CREATE DATABASE otel; CREATE USER hyperdx_ingest IDENTIFIED WITH sha256_password BY 'ClickH0u3eRocks123!'; GRANT SELECT, INSERT, CREATE TABLE, CREATE VIEW ON otel.* TO hyperdx_ingest; ``` - ### Deploy ClickStack {#deploy-clickstack} - -Deploy ClickStack - the [Helm](/use-cases/observability/clickstack/deployment/helm) or [Docker Compose](/use-cases/observability/clickstack/deployment/docker-compose) (modified to exclude ClickHouse) deployment models are preferred. - +Deploy ClickStack - the [Helm](/use-cases/observability/clickstack/deployment/helm) or [Docker Compose](/use-cases/observability/clickstack/deployment/docker-compose) (modified to exclude ClickHouse) deployment models are preferred. :::note Deploying components separately Advanced users can deploy the [OTel collector](/use-cases/observability/clickstack/ingesting-data/opentelemetry#standalone) and [HyperDX](/use-cases/observability/clickstack/deployment/hyperdx-only) separately with their respective standalone deployment modes. ::: - Instructions for using ClickHouse Cloud with the Helm chart can be found [here](/use-cases/observability/clickstack/deployment/helm#using-clickhouse-cloud). Equivalent instructions for Docker Compose can be found [here](/use-cases/observability/clickstack/deployment/docker-compose). 
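Before continuing, it can be useful to confirm from the Cloud SQL console that the two users created above have the privileges you expect - an optional check:

```sql
-- Optional: confirm the read-only UI user and the ingestion user have the intended grants.
SHOW GRANTS FOR hyperdx;
SHOW GRANTS FOR hyperdx_ingest;
```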
- ### Navigate to the HyperDX UI {#navigate-to-hyperdx-ui} - Visit [http://localhost:8080](http://localhost:8080) to access the HyperDX UI. - -Create a user, providing a username and password which meets the requirements. - +Create a user, providing a username and password which meets the requirements. HyperDX UI - On clicking `Create` you'll be prompted for connection details. - ### Connect to ClickHouse Cloud {#connect-to-clickhouse-cloud} - Using the credentials created earlier, complete the connection details and click `Create`. - HyperDX Cloud - ### Send data to ClickStack {#send-data} - To send data to ClickStack see ["Sending OpenTelemetry data"](/use-cases/observability/clickstack/ingesting-data/opentelemetry#sending-otel-data). - diff --git a/docs/use-cases/observability/clickstack/search.md b/docs/use-cases/observability/clickstack/search.md index 4099ac8abd3..b6244a5c50f 100644 --- a/docs/use-cases/observability/clickstack/search.md +++ b/docs/use-cases/observability/clickstack/search.md @@ -19,35 +19,35 @@ as well. - Searches are not case sensitive - Searches match by whole word by default (ex. `Error` will match `Error here` - but not `Errors here`). You can surround a word by wildcards to match partial - words (ex. `*Error*` will match `AnyError` and `AnyErrors`) + but not `Errors here`). You can surround a word by wildcards to match partial + words (ex. `*Error*` will match `AnyError` and `AnyErrors`) - Search terms are searched in any order (ex. `Hello World` will match logs that - contain `Hello World` and `World Hello`) + contain `Hello World` and `World Hello`) - You can exclude keywords by using `NOT` or `-` (ex. `Error NOT Exception` or - `Error -Exception`) + `Error -Exception`) - You can use `AND` and `OR` to combine multiple keywords (ex. - `Error OR Exception`) + `Error OR Exception`) - Exact matches can be done via double quotes (ex. `"Error tests not found"`) -Search + Search ### Column/property search {#column-search} - You can search columns and JSON/map properties by using `column:value` (ex. `level:Error`, - `service:app`) + `service:app`) - You can search for a range of values by using comparison operators (`>`, `<`, - `>=`, `<=`) (ex. `Duration:>1000`) + `>=`, `<=`) (ex. `Duration:>1000`) - You can search for the existence of a property by using `property:*` (ex. - `duration:*`) + `duration:*`) ## Time input {#time-input} - Time input accepts natural language inputs (ex. `1 hour ago`, `yesterday`, - `last week`) + `last week`) - Specifying a single point in time will result in searching from that point in - time up until now. + time up until now. - Time range will always be converted into the parsed time range upon search for - easy debugging of time queries. + easy debugging of time queries. - You can highlight a histogram bar to zoom into a specific time range as well. ## SQL search syntax {#sql-syntax} diff --git a/docs/use-cases/observability/clickstack/ttl.md b/docs/use-cases/observability/clickstack/ttl.md index a11ba3eef2e..31a61d546d2 100644 --- a/docs/use-cases/observability/clickstack/ttl.md +++ b/docs/use-cases/observability/clickstack/ttl.md @@ -54,13 +54,13 @@ TTL TimestampTime + toIntervalDay(3) SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1 ``` -Partitioning in ClickHouse allows data to be logically separated on disk according to a column or SQL expression. By separating data logically, each partition can be operated on independently e.g. deleted when it expires according to a TTL policy. 
+Partitioning in ClickHouse allows data to be logically separated on disk according to a column or SQL expression. By separating data logically, each partition can be operated on independently e.g. deleted when it expires according to a TTL policy. As shown in the above example, partitioning is specified on a table when it is initially defined via the `PARTITION BY` clause. This clause can contain an SQL expression on any column/s, the results of which will define which partition a row is sent to. This causes data to be logically associated (via a common folder name prefix) with each partition on the disk, which can then be queried in isolation. For the example above, the default `otel_logs` schema partitions by day using the expression `toDate(Timestamp).` As rows are inserted into ClickHouse, this expression will be evaluated against each row and routed to the resulting partition if it exists (if the row is the first for a day, the partition will be created). For further details on partitioning and its other applications, see ["Table Partitions"](/partitions). Partitions -The table schema also includes a `TTL TimestampTime + toIntervalDay(3)` and setting `ttl_only_drop_parts = 1`. The former clause ensures data will be dropped once it is older than 3 days. The setting `ttl_only_drop_parts = 1` enforces only expiring data parts where all of the data has expired (vs. attempting to partially delete rows). With partitioning ensuring data from separate days is never "merged," data can thus be efficiently dropped. +The table schema also includes a `TTL TimestampTime + toIntervalDay(3)` and setting `ttl_only_drop_parts = 1`. The former clause ensures data will be dropped once it is older than 3 days. The setting `ttl_only_drop_parts = 1` enforces only expiring data parts where all of the data has expired (vs. attempting to partially delete rows). With partitioning ensuring data from separate days is never "merged," data can thus be efficiently dropped. :::important `ttl_only_drop_parts` We recommend always using the setting [`ttl_only_drop_parts=1`](/operations/settings/merge-tree-settings#ttl_only_drop_parts). When this setting is enabled, ClickHouse drops a whole part when all rows in it are expired. Dropping whole parts instead of partial cleaning TTL-d rows (achieved through resource-intensive mutations when `ttl_only_drop_parts=0`) allows having shorter `merge_with_ttl_timeout` times and lower impact on system performance. If data is partitioned by the same unit at which you perform TTL expiration e.g. day, parts will naturally only contain data from the defined interval. This will ensure `ttl_only_drop_parts=1` can be efficiently applied. @@ -78,19 +78,19 @@ To modify TTL users can either: 1. **Modify the table schemas (recommended)**. This requires connecting to the ClickHouse instance e.g. using the [clickhouse-client](/interfaces/cli) or [Cloud SQL Console](/cloud/get-started/sql-console). For example, we can modify the TTL for the `otel_logs` table using the following DDL: -```sql -ALTER TABLE default.otel_logs -MODIFY TTL TimestampTime + toIntervalDay(7); -``` + ```sql + ALTER TABLE default.otel_logs + MODIFY TTL TimestampTime + toIntervalDay(7); + ``` 2. **Modify the OTel collector**. The ClickStack OpenTelemetry collector creates tables in ClickHouse if they do not exist. This is achieved via the ClickHouse exporter, which itself exposes a `ttl` parameter used for controlling the default TTL expression e.g. 
-```yaml -exporters: - clickhouse: - endpoint: tcp://localhost:9000?dial_timeout=10s&compress=lz4&async_insert=1 - ttl: 72h -``` + ```yaml + exporters: + clickhouse: + endpoint: tcp://localhost:9000?dial_timeout=10s&compress=lz4&async_insert=1 + ttl: 72h + ``` ### Column level TTL {#column-level-ttl} diff --git a/docs/use-cases/observability/index.md b/docs/use-cases/observability/index.md index 31b03f1448c..3845ba84e8c 100644 --- a/docs/use-cases/observability/index.md +++ b/docs/use-cases/observability/index.md @@ -27,7 +27,6 @@ The ClickHouse Observability Stack is our **recommended approach** for most user | [Search](/use-cases/observability/clickstack/search) | How to search and query your observability data | | [Production](/use-cases/observability/clickstack/production) | Best practices for production deployment | - ## Build-your-own stack {#build-your-own-stack} For users with **custom requirements** — such as highly specialized ingestion pipelines, schema designs, or extreme scaling needs — we provide guidance to build a custom observability stack with ClickHouse as the core database. diff --git a/docs/use-cases/time-series/analysis-functions.md b/docs/use-cases/time-series/analysis-functions.md index 64329b30817..fcc3859609f 100644 --- a/docs/use-cases/time-series/analysis-functions.md +++ b/docs/use-cases/time-series/analysis-functions.md @@ -9,20 +9,20 @@ show_related_blogs: true # Time-series analysis functions -Time series analysis in ClickHouse can be performed using standard SQL aggregation and window functions. +Time series analysis in ClickHouse can be performed using standard SQL aggregation and window functions. When working with time series data, you'll typically encounter three main types of metrics: * Counter metrics that monotonically increase over time (like page views or total events) * Gauge metrics that represent point-in-time measurements that can go up and down (like CPU usage or temperature) * Histograms that sample observations and count them in buckets (like request durations or response sizes) -Common analysis patterns for these metrics include comparing values between periods, calculating cumulative totals, determining rates of change, and analyzing distributions. -These can all be achieved through combinations of aggregations, window functions like `sum() OVER`, and specialized functions like `histogram()`. + Common analysis patterns for these metrics include comparing values between periods, calculating cumulative totals, determining rates of change, and analyzing distributions. + These can all be achieved through combinations of aggregations, window functions like `sum() OVER`, and specialized functions like `histogram()`. ## Period-over-period changes {#time-series-period-over-period-changes} -When analyzing time series data, we often need to understand how values change between time periods. -This is essential for both gauge and counter metrics. +When analyzing time series data, we often need to understand how values change between time periods. +This is essential for both gauge and counter metrics. The [`lagInFrame`](/docs/sql-reference/window-functions/lagInFrame) window function lets us access the previous period's value to calculate these changes. The following query demonstrates this by calculating day-over-day changes in views for "Weird Al" Yankovic's Wikipedia page. @@ -57,10 +57,10 @@ LIMIT 10; ## Cumulative values {#time-series-cumulative-values} -Counter metrics naturally accumulate over time. 
+Counter metrics naturally accumulate over time. To analyze this cumulative growth, we can calculate running totals using window functions. -The following query demonstrates this by using the `sum() OVER` clause creates a running total, while the `bar()` function provides a visual representation of the growth. +The following query demonstrates this by using the `sum() OVER` clause creates a running total, while the `bar()` function provides a visual representation of the growth. ```sql SELECT @@ -92,11 +92,10 @@ LIMIT 10; ## Rate calculations {#time-series-rate-calculations} -When analyzing time series data, it's often useful to understand the rate of events per unit of time. -This query calculates the rate of page views per second by dividing hourly totals by the number of seconds in an hour (3600). +When analyzing time series data, it's often useful to understand the rate of events per unit of time. +This query calculates the rate of page views per second by dividing hourly totals by the number of seconds in an hour (3600). The visual bar helps identify peak hours of activity. - ```sql SELECT toStartOfHour(time) AS time, @@ -109,7 +108,6 @@ GROUP BY time LIMIT 10; ``` - ```text ┌────────────────time─┬───h─┬─rate─┬─b─────┐ │ 2015-07-01 01:00:00 │ 143 │ 0.04 │ █▊ │ @@ -127,7 +125,7 @@ LIMIT 10; ## Histograms {#time-series-histograms} -A popular use case for time series data is to build histograms based on tracked events. +A popular use case for time series data is to build histograms based on tracked events. Suppose we wanted to understand the distribution of a number of pages based on their total hits, only including pages that have over 10,000 hits. We can use the `histogram()` function to automatically generate an adaptive histogram based on the number of bins: @@ -155,7 +153,6 @@ hist: [(10033,23224.55065359477,60.625),(23224.55065359477,37855.38888888889,15. We can then use [`arrayJoin()`](/docs/sql-reference/functions/array-join) to massage the data and `bar()` to visualize it: - ```sql WITH histogram(10)(hits) AS hist SELECT diff --git a/docs/use-cases/time-series/basic-operations.md b/docs/use-cases/time-series/basic-operations.md index b4652c167f7..37ccee6481c 100644 --- a/docs/use-cases/time-series/basic-operations.md +++ b/docs/use-cases/time-series/basic-operations.md @@ -9,10 +9,10 @@ show_related_blogs: true # Basic time-series operations -ClickHouse provides several methods for working with time series data, allowing you to aggregate, group, and analyze data points across different time periods. +ClickHouse provides several methods for working with time series data, allowing you to aggregate, group, and analyze data points across different time periods. This section covers the fundamental operations commonly used when working with time-based data. -Common operations include grouping data by time intervals, handling gaps in time series data, and calculating changes between time periods. +Common operations include grouping data by time intervals, handling gaps in time series data, and calculating changes between time periods. These operations can be performed using standard SQL syntax combined with ClickHouse's built-in time functions. 
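Before turning to the real dataset, the following sketch shows the general shape of such a query. It assumes a hypothetical `events` table with an `event_time` column and a `hits` column (not part of the dataset used below); the remaining sections apply the same pattern to the Wikistat data.

```sql
-- Hypothetical table: bucket hits into daily totals and
-- fill days that have no rows so the series contains no gaps.
SELECT
    toStartOfDay(event_time) AS day,
    sum(hits) AS total_hits
FROM events
GROUP BY day
ORDER BY day ASC WITH FILL STEP toIntervalDay(1);
```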
We're going to explore ClickHouse time-series querying capabilities with the Wikistat (Wikipedia pageviews data) dataset: @@ -33,9 +33,9 @@ ORDER BY (time); Let's populate this table with 1 billion records: ```sql -INSERT INTO wikistat +INSERT INTO wikistat SELECT * -FROM s3('https://ClickHouse-public-datasets.s3.amazonaws.com/wikistat/partitioned/wikistat*.native.zst') +FROM s3('https://ClickHouse-public-datasets.s3.amazonaws.com/wikistat/partitioned/wikistat*.native.zst') LIMIT 1e9; ``` @@ -65,11 +65,10 @@ LIMIT 5; We've used the [`toDate()`](/sql-reference/functions/type-conversion-functions#todate) function here, which converts the specified time to a date type. Alternatively, we can batch by an hour and filter on the specific date: - ```sql SELECT toStartOfHour(time) AS hour, - sum(hits) AS hits + sum(hits) AS hits FROM wikistat WHERE date(time) = '2015-07-01' GROUP BY ALL @@ -87,12 +86,12 @@ LIMIT 5; └─────────────────────┴────────┘ ``` -The [`toStartOfHour()`](/docs/sql-reference/functions/date-time-functions#tostartofhour) function used here converts the given time to the start of the hour. +The [`toStartOfHour()`](/docs/sql-reference/functions/date-time-functions#tostartofhour) function used here converts the given time to the start of the hour. You can also group by year, quarter, month, or day. ## Custom grouping intervals {#time-series-custom-grouping-intervals} -We can even group by arbitrary intervals, e.g., 5 minutes using the [`toStartOfInterval()`](/docs/sql-reference/functions/date-time-functions#tostartofinterval) function. +We can even group by arbitrary intervals, e.g., 5 minutes using the [`toStartOfInterval()`](/docs/sql-reference/functions/date-time-functions#tostartofinterval) function. Let's say we want to group by 4-hour intervals. We can specify the grouping interval using the [`INTERVAL`](/docs/sql-reference/data-types/special-data-types/interval) clause: @@ -216,14 +215,14 @@ ORDER BY hour ASC WITH FILL STEP toIntervalHour(1); ## Rolling time windows {#time-series-rolling-time-windows} -Sometimes, we don't want to deal with the start of intervals (like the start of the day or an hour) but window intervals. -Let's say we want to understand the total hits for a window, not based on days but on a 24-hour period offset from 6 pm. +Sometimes, we don't want to deal with the start of intervals (like the start of the day or an hour) but window intervals. +Let's say we want to understand the total hits for a window, not based on days but on a 24-hour period offset from 6 pm. -We can use the [`date_diff()`](/docs/sql-reference/functions/date-time-functions#date_diff) function to calculate the difference between a reference time and each record's time. +We can use the [`date_diff()`](/docs/sql-reference/functions/date-time-functions#date_diff) function to calculate the difference between a reference time and each record's time. 
In this case, the `day` column will represent the difference in days (e.g., 1 day ago, 2 days ago, etc.): ```sql -SELECT +SELECT dateDiff('day', toDateTime('2015-05-01 18:00:00'), time) AS day, sum(hits), FROM wikistat diff --git a/docs/use-cases/time-series/date-time-data-types.md b/docs/use-cases/time-series/date-time-data-types.md index fae37ceba9e..ce0a7e0343e 100644 --- a/docs/use-cases/time-series/date-time-data-types.md +++ b/docs/use-cases/time-series/date-time-data-types.md @@ -9,13 +9,13 @@ show_related_blogs: true # Date and time data types -Having a comprehensive suite of date and time types is necessary for effective time series data management, and ClickHouse delivers exactly that. +Having a comprehensive suite of date and time types is necessary for effective time series data management, and ClickHouse delivers exactly that. From compact date representations to high-precision timestamps with nanosecond accuracy, these types are designed to balance storage efficiency with practical requirements for different time series applications. -Whether you're working with historical financial data, IoT sensor readings, or future-dated events, ClickHouse's date and time types provide the flexibility needed to handle various temporal data scenarios. +Whether you're working with historical financial data, IoT sensor readings, or future-dated events, ClickHouse's date and time types provide the flexibility needed to handle various temporal data scenarios. The range of supported types allows you to optimize both storage space and query performance while maintaining the precision your use case demands. -* The [`Date`](/sql-reference/data-types/date) type should be sufficient in most cases. This type requires 2 bytes to store a date and limits the range to `[1970-01-01, 2149-06-06]`. +* The [`Date`](/sql-reference/data-types/date) type should be sufficient in most cases. This type requires 2 bytes to store a date and limits the range to `[1970-01-01, 2149-06-06]`. * [`Date32`](/sql-reference/data-types/date32) covers a wider range of dates. It requires 4 bytes to store a date and limits the range to `[1900-01-01, 2299-12-31]` @@ -23,55 +23,53 @@ The range of supported types allows you to optimize both storage space and query * For cases where more precision is required, [`DateTime64`](/sql-reference/data-types/datetime64) can be used. This allows storing time with up to nanoseconds precision, with a range of `[1900-01-01 00:00:00, 2299-12-31 23:59:59.99999999]`. It requires 8 bytes per value. -Let's create a table that stores various date types: + Let's create a table that stores various date types: - -```sql -CREATE TABLE dates -( + ```sql + CREATE TABLE dates + ( `date` Date, `wider_date` Date32, `datetime` DateTime, `precise_datetime` DateTime64(3), `very_precise_datetime` DateTime64(9) -) -ENGINE = MergeTree -ORDER BY tuple(); -``` + ) + ENGINE = MergeTree + ORDER BY tuple(); + ``` -We can use the [`now()`](/sql-reference/functions/date-time-functions#now) function to return the current time and [`now64()`](/sql-reference/functions/date-time-functions#now64) to get it in a specified precision via the first argument. + We can use the [`now()`](/sql-reference/functions/date-time-functions#now) function to return the current time and [`now64()`](/sql-reference/functions/date-time-functions#now64) to get it in a specified precision via the first argument. 
-```sql -INSERT INTO dates -SELECT now(), + ```sql + INSERT INTO dates + SELECT now(), now()::Date32 + toIntervalYear(100), - now(), - now64(3), + now(), + now64(3), now64(9) + toIntervalYear(200); -``` + ``` -This will populate our columns with time accordingly to the column type: + This will populate our columns with time accordingly to the column type: -```sql -SELECT * FROM dates -FORMAT Vertical; -``` + ```sql + SELECT * FROM dates + FORMAT Vertical; + ``` -```text -Row 1: -────── -date: 2025-03-12 -wider_date: 2125-03-12 -datetime: 2025-03-12 11:39:07 -precise_datetime: 2025-03-12 11:39:07.196 -very_precise_datetime: 2025-03-12 11:39:07.196724000 -``` + ```text + Row 1: + ────── + date: 2025-03-12 + wider_date: 2125-03-12 + datetime: 2025-03-12 11:39:07 + precise_datetime: 2025-03-12 11:39:07.196 + very_precise_datetime: 2025-03-12 11:39:07.196724000 + ``` ## Timezones {#time-series-timezones} Many use cases require having timezones stored as well. We can set the timezone as the last argument to the `DateTime` or `DateTime64` types: - ```sql CREATE TABLE dtz ( @@ -87,16 +85,15 @@ ORDER BY id; Having defined a timezone in our DDL, we can now insert times using different timezones: - ```sql -INSERT INTO dtz -SELECT 1, +INSERT INTO dtz +SELECT 1, toDateTime('2022-12-12 12:13:14', 'America/New_York'), toDateTime('2022-12-12 12:13:14', 'America/New_York'), toDateTime64('2022-12-12 12:13:14.123456789', 9, 'America/New_York'), toDateTime64('2022-12-12 12:13:14.123456789', 9, 'America/New_York') UNION ALL -SELECT 2, +SELECT 2, toDateTime('2022-12-12 12:13:15'), toDateTime('2022-12-12 12:13:15'), toDateTime64('2022-12-12 12:13:15.123456789', 9), @@ -131,9 +128,8 @@ In the first row, we inserted all values using the `America/New_York` timezone. * `dt_1` and `dt64_1` are automatically converted to `Europe/Berlin` at query time. * `dt_2` and `dt64_2` didn't have a time zone specified, so they use the server's local time zone, which in this case is `Europe/London`. -In the second row, we inserted all the values without a timezone, so the server's local time zone was used. -As in the first row, `dt_1` and `dt_3` are converted to `Europe/Berlin`, while `dt_2` and `dt64_2` use the server's local time zone. - + In the second row, we inserted all the values without a timezone, so the server's local time zone was used. + As in the first row, `dt_1` and `dt_3` are converted to `Europe/Berlin`, while `dt_2` and `dt64_2` use the server's local time zone. ## Date and time functions {#time-series-date-time-functions} @@ -147,7 +143,7 @@ SELECT toTypeName(current_time), toDate(current_time) AS date_only, toTypeName(date_only) -FORMAT Vertical; +FORMAT Vertical; ``` ```text diff --git a/docs/use-cases/time-series/index.md b/docs/use-cases/time-series/index.md index d1a37acd28e..1976de65260 100644 --- a/docs/use-cases/time-series/index.md +++ b/docs/use-cases/time-series/index.md @@ -20,4 +20,4 @@ This guide walks you through everything you need to know about working with time * [Optimize storage efficiency for temporal data](./storage-efficiency.md) * [Tune query performance for time-series workloads](./query-performance.md) -Whether you're new to time-series analysis or looking to optimize an existing implementation, this guide will help you make the most of ClickHouse's time-series capabilities. + Whether you're new to time-series analysis or looking to optimize an existing implementation, this guide will help you make the most of ClickHouse's time-series capabilities. 
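To complement the column-level time zone behaviour shown above, a stored value can also be rendered in a different time zone at query time. A small sketch using the built-in `toTimeZone()` function (no table or data assumed):

```sql
-- Render the same instant in the server time zone and two explicit time zones.
SELECT
    now() AS server_time,
    toTimeZone(now(), 'America/New_York') AS new_york_time,
    toTimeZone(now(), 'Europe/Berlin') AS berlin_time;
```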
diff --git a/docs/use-cases/time-series/query-performance.md b/docs/use-cases/time-series/query-performance.md index 84151cbf642..a18fa1bf91e 100644 --- a/docs/use-cases/time-series/query-performance.md +++ b/docs/use-cases/time-series/query-performance.md @@ -9,14 +9,14 @@ show_related_blogs: true # Time-series query performance -After optimizing storage, the next step is improving query performance. -This section explores two key techniques: optimizing `ORDER BY` keys and using materialized views. +After optimizing storage, the next step is improving query performance. +This section explores two key techniques: optimizing `ORDER BY` keys and using materialized views. We'll see how these approaches can reduce query times from seconds to milliseconds. ## Optimize ORDER BY keys {#time-series-optimize-order-by} -Before attempting other optimizations, you should optimize their ordering key to ensure ClickHouse produces the fastest possible results. -Choosing the key right largely depends on the queries you're going to run. Suppose most of our queries filter by `project` and `subproject` columns. +Before attempting other optimizations, you should optimize their ordering key to ensure ClickHouse produces the fastest possible results. +Choosing the key right largely depends on the queries you're going to run. Suppose most of our queries filter by `project` and `subproject` columns. In this case, its a good idea to add them to the ordering key - as well as the time column since we query on time as well: Let's create another version of the table that has the same column types as `wikistat`, but is ordered by `(project, subproject, time)`. @@ -53,7 +53,7 @@ FROM wikistat GROUP BY project ORDER BY h DESC LIMIT 10; -``` +``` 2.381 sec 1.660 sec @@ -68,12 +68,12 @@ WHERE project = 'it' GROUP BY subproject ORDER BY h DESC LIMIT 10; -``` +``` 2.148 sec 0.058 sec - + ```sql @@ -83,7 +83,7 @@ WHERE (project = 'it') AND (subproject = 'zero') GROUP BY m ORDER BY m DESC LIMIT 10; -``` +``` 2.192 sec 0.012 sec @@ -98,12 +98,11 @@ WHERE (project = 'it') AND (subproject = 'zero') GROUP BY path ORDER BY h DESC LIMIT 10; -``` +``` 2.968 sec 0.010 sec - @@ -112,7 +111,6 @@ LIMIT 10; Another option is to use materialized views to aggregate and store the results of popular queries. These results can be queried instead of the original table. Suppose the following query is executed quite often in our case: - ```sql SELECT path, SUM(hits) AS v FROM wikistat @@ -156,7 +154,7 @@ ORDER BY (month, hits); ``` ```sql -CREATE MATERIALIZED VIEW wikistat_top_mv +CREATE MATERIALIZED VIEW wikistat_top_mv TO wikistat_top AS SELECT @@ -190,81 +188,76 @@ Depending on the cardinality of the raw data set (we have 1 billion rows!), this * Using an INSERT INTO SELECT query, copying all data from the raw data set into that temporary table * Dropping the temporary table and the temporary materialized view. -With that approach, rows from the raw data set are copied block-wise into the temporary table (which doesn't store any of these rows), and for each block of rows, a partial state is calculated and written to the target table, where these states are incrementally merged in the background. - + With that approach, rows from the raw data set are copied block-wise into the temporary table (which doesn't store any of these rows), and for each block of rows, a partial state is calculated and written to the target table, where these states are incrementally merged in the background. 
-```sql -CREATE TABLE wikistat_backfill -( + ```sql + CREATE TABLE wikistat_backfill + ( `time` DateTime, `project` String, `subproject` String, `path` String, `hits` UInt64 -) -ENGINE = Null; -``` - -Next, we'll create a materialized view to read from `wikistat_backfill` and write into `wikistat_top` + ) + ENGINE = Null; + ``` + Next, we'll create a materialized view to read from `wikistat_backfill` and write into `wikistat_top` -```sql -CREATE MATERIALIZED VIEW wikistat_backfill_top_mv -TO wikistat_top -AS -SELECT + ```sql + CREATE MATERIALIZED VIEW wikistat_backfill_top_mv + TO wikistat_top + AS + SELECT path, toStartOfMonth(time) AS month, sum(hits) AS hits -FROM wikistat_backfill -GROUP BY path, month; -``` - -And then finally, we'll populate `wikistat_backfill` from the initial `wikistat` table: - -```sql -INSERT INTO wikistat_backfill -SELECT * -FROM wikistat; -``` - -Once that query's finished, we can delete the backfill table and materialized view: - -```sql -DROP VIEW wikistat_backfill_top_mv; -DROP TABLE wikistat_backfill; -``` - -Now we can query the materialized view instead of the original table: - - -```sql -SELECT path, sum(hits) AS hits -FROM wikistat_top -WHERE month = '2015-05-01' -GROUP BY ALL -ORDER BY hits DESC -LIMIT 10; -``` - -```text -┌─path──────────────────┬─────hits─┐ -│ - │ 89543168 │ -│ Angelsberg │ 7047863 │ -│ Ana_Sayfa │ 5923985 │ -│ Academy_Awards │ 4497264 │ -│ Accueil_(homonymie) │ 2522074 │ -│ 2015_in_spaceflight │ 2050098 │ -│ Adolf_Hitler │ 1559520 │ -│ 19_Kids_and_Counting │ 813275 │ -│ Andrzej_Duda │ 796156 │ -│ 2015_Nepal_earthquake │ 726327 │ -└───────────────────────┴──────────┘ - -10 rows in set. Elapsed: 0.004 sec. -``` - - -Our performance improvement here is dramatic. -Before it took just over 2 seconds to compute the answer to this query and now it takes only 4 milliseconds. - + FROM wikistat_backfill + GROUP BY path, month; + ``` + + And then finally, we'll populate `wikistat_backfill` from the initial `wikistat` table: + + ```sql + INSERT INTO wikistat_backfill + SELECT * + FROM wikistat; + ``` + + Once that query's finished, we can delete the backfill table and materialized view: + + ```sql + DROP VIEW wikistat_backfill_top_mv; + DROP TABLE wikistat_backfill; + ``` + + Now we can query the materialized view instead of the original table: + + ```sql + SELECT path, sum(hits) AS hits + FROM wikistat_top + WHERE month = '2015-05-01' + GROUP BY ALL + ORDER BY hits DESC + LIMIT 10; + ``` + + ```text + ┌─path──────────────────┬─────hits─┐ + │ - │ 89543168 │ + │ Angelsberg │ 7047863 │ + │ Ana_Sayfa │ 5923985 │ + │ Academy_Awards │ 4497264 │ + │ Accueil_(homonymie) │ 2522074 │ + │ 2015_in_spaceflight │ 2050098 │ + │ Adolf_Hitler │ 1559520 │ + │ 19_Kids_and_Counting │ 813275 │ + │ Andrzej_Duda │ 796156 │ + │ 2015_Nepal_earthquake │ 726327 │ + └───────────────────────┴──────────┘ + + 10 rows in set. Elapsed: 0.004 sec. + ``` + + Our performance improvement here is dramatic. + Before it took just over 2 seconds to compute the answer to this query and now it takes only 4 milliseconds. diff --git a/docs/use-cases/time-series/storage-efficiency.md b/docs/use-cases/time-series/storage-efficiency.md index 27cc735c7aa..34ada2d0ec4 100644 --- a/docs/use-cases/time-series/storage-efficiency.md +++ b/docs/use-cases/time-series/storage-efficiency.md @@ -9,12 +9,12 @@ show_related_blogs: true # Time-series storage efficiency -After exploring how to query our Wikipedia statistics dataset, let's focus on optimizing its storage efficiency in ClickHouse. 
+After exploring how to query our Wikipedia statistics dataset, let's focus on optimizing its storage efficiency in ClickHouse. This section demonstrates practical techniques to reduce storage requirements while maintaining query performance. ## Type optimization {#time-series-type-optimization} -The general approach to optimizing storage efficiency is using optimal data types. +The general approach to optimizing storage efficiency is using optimal data types. Let's take the `project` and `subproject` columns. These columns are of type String, but have a relatively small amount of unique values: ```sql @@ -32,7 +32,6 @@ FROM wikistat; This means we can use the LowCardinality() data type, which uses dictionary-based encoding. This causes ClickHouse to store the internal value ID instead of the original string value, which in turn saves a lot of space: - ```sql ALTER TABLE wikistat MODIFY COLUMN `project` LowCardinality(String), @@ -63,7 +62,7 @@ This will reduce the size of this column in memory by at least 2 times. Note tha ## Specialized codecs {#time-series-specialized-codecs} -When we deal with sequential data, like time-series, we can further improve storage efficiency by using special codecs. +When we deal with sequential data, like time-series, we can further improve storage efficiency by using special codecs. The general idea is to store changes between values instead of absolute values themselves, which results in much less space needed when dealing with slowly changing data: ```sql @@ -71,11 +70,11 @@ ALTER TABLE wikistat MODIFY COLUMN `time` CODEC(Delta, ZSTD); ``` -We've used the Delta codec for time column, which is a good fit for time series data. +We've used the Delta codec for time column, which is a good fit for time series data. -The right ordering key can also save disk space. +The right ordering key can also save disk space. Since we usually want to filter by a path, we will add `path` to the sorting key. -This requires recreation of the table. +This requires recreation of the table. Below we can see the `CREATE` command for our initial table and the optimized table: diff --git a/docs/whats-new/changelog/2020.md b/docs/whats-new/changelog/2020.md index 4a2f7bd00a3..c64caff2772 100644 --- a/docs/whats-new/changelog/2020.md +++ b/docs/whats-new/changelog/2020.md @@ -23,7 +23,6 @@ description: 'Changelog for 2020' * Update timezones info to 2020e. [#18531](https://github.com/ClickHouse/ClickHouse/pull/18531) ([alesapin](https://github.com/alesapin)). - ### ClickHouse release v20.12.4.5-stable, 2020-12-24 {#clickhouse-release-v201245-stable-2020-12-24} #### Bug Fix {#bug-fix-1} @@ -37,7 +36,6 @@ description: 'Changelog for 2020' * Fixed possible segfault in `topK` aggregate function. This closes [#17404](https://github.com/ClickHouse/ClickHouse/issues/17404). [#17845](https://github.com/ClickHouse/ClickHouse/pull/17845) ([Maksim Kita](https://github.com/kitaisreal)). * Fixed empty `system.stack_trace` table when server is running in daemon mode. [#17630](https://github.com/ClickHouse/ClickHouse/pull/17630) ([Amos Bird](https://github.com/amosbird)). - ### ClickHouse release v20.12.3.3-stable, 2020-12-13 {#clickhouse-release-v201233-stable-2020-12-13} #### Backward Incompatible Change {#backward-incompatible-change} @@ -158,7 +156,6 @@ description: 'Changelog for 2020' * Fix UBSan report in cache dictionaries. This closes [#12641](https://github.com/ClickHouse/ClickHouse/issues/12641). 
[#16763](https://github.com/ClickHouse/ClickHouse/pull/16763) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Fix UBSan report when trying to convert infinite floating point number to integer. This closes [#14190](https://github.com/ClickHouse/ClickHouse/issues/14190). [#16677](https://github.com/ClickHouse/ClickHouse/pull/16677) ([alexey-milovidov](https://github.com/alexey-milovidov)). - ## ClickHouse release 20.11 {#clickhouse-release-2011} ### ClickHouse release v20.11.7.16-stable, 2021-03-02 {#clickhouse-release-v2011716-stable-2021-03-02} @@ -223,8 +220,6 @@ description: 'Changelog for 2020' * Update timezones info to 2020e. [#18531](https://github.com/ClickHouse/ClickHouse/pull/18531) ([alesapin](https://github.com/alesapin)). - - ### ClickHouse release v20.11.6.6-stable, 2020-12-24 {#clickhouse-release-v201166-stable-2020-12-24} #### Bug Fix {#bug-fix-4} @@ -270,14 +265,12 @@ description: 'Changelog for 2020' * Fixed inconsistent behavior caused by `select_sequential_consistency` for optimized trivial count query and system.tables. [#16309](https://github.com/ClickHouse/ClickHouse/pull/16309) ([Hao Chen](https://github.com/haoch)). * Throw error when use ColumnTransformer replace non exist column. [#16183](https://github.com/ClickHouse/ClickHouse/pull/16183) ([hexiaoting](https://github.com/hexiaoting)). - ### ClickHouse release v20.11.3.3-stable, 2020-11-13 {#clickhouse-release-v201133-stable-2020-11-13} #### Bug Fix {#bug-fix-5} * Fix rare silent crashes when query profiler is on and ClickHouse is installed on OS with glibc version that has (supposedly) broken asynchronous unwind tables for some functions. This fixes [#15301](https://github.com/ClickHouse/ClickHouse/issues/15301). This fixes [#13098](https://github.com/ClickHouse/ClickHouse/issues/13098). [#16846](https://github.com/ClickHouse/ClickHouse/pull/16846) ([alexey-milovidov](https://github.com/alexey-milovidov)). - ### ClickHouse release v20.11.2.1, 2020-11-11 {#clickhouse-release-v201121-2020-11-11} #### Backward Incompatible Change {#backward-incompatible-change-1} @@ -397,7 +390,6 @@ description: 'Changelog for 2020' * Simplify Sys/V init script. [#14135](https://github.com/ClickHouse/ClickHouse/pull/14135) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Added `boost::program_options` to `db_generator` in order to increase its usability. This closes [#15940](https://github.com/ClickHouse/ClickHouse/issues/15940). [#15973](https://github.com/ClickHouse/ClickHouse/pull/15973) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). - ## ClickHouse release 20.10 {#clickhouse-release-2010} ### ClickHouse release v20.10.7.4-stable, 2020-12-24 {#clickhouse-release-v201074-stable-2020-12-24} @@ -439,7 +431,6 @@ description: 'Changelog for 2020' * Fixed uncontrolled growth of `TDigest`. [#16680](https://github.com/ClickHouse/ClickHouse/pull/16680) ([hrissan](https://github.com/hrissan)). * Fixed remote query failure when using suffix `if` in Aggregate function. This fixes [#16574](https://github.com/ClickHouse/ClickHouse/issues/16574) fixes [#16231](https://github.com/ClickHouse/ClickHouse/issues/16231) [#16610](https://github.com/ClickHouse/ClickHouse/pull/16610) ([Winter Zhang](https://github.com/zhang2014)). - ### ClickHouse release v20.10.4.1-stable, 2020-11-13 {#clickhouse-release-v201041-stable-2020-11-13} #### Bug Fix {#bug-fix-8} @@ -457,7 +448,6 @@ description: 'Changelog for 2020' * Workaround for use S3 with nginx server as proxy. 
Nginx currenty does not accept urls with empty path like http://domain.com?delete, but vanilla aws-sdk-cpp produces this kind of urls. This commit uses patched aws-sdk-cpp version, which makes urls with "/" as path in this cases, like http://domain.com/?delete. [#16813](https://github.com/ClickHouse/ClickHouse/pull/16813) ([ianton-ru](https://github.com/ianton-ru)). - ### ClickHouse release v20.10.3.30, 2020-10-28 {#clickhouse-release-v2010330-2020-10-28} #### Backward Incompatible Change {#backward-incompatible-change-2} @@ -668,7 +658,6 @@ description: 'Changelog for 2020' * Use std::filesystem::path in ConfigProcessor for concatenating file paths. [#14558](https://github.com/ClickHouse/ClickHouse/pull/14558) ([Bharat Nallan](https://github.com/bharatnc)). * Fix debug assertion in `bitShiftLeft()` when called with negative big integer. [#14697](https://github.com/ClickHouse/ClickHouse/pull/14697) ([Artem Zuikov](https://github.com/4ertus2)). - ## ClickHouse release 20.9 {#clickhouse-release-209} ### ClickHouse release v20.9.7.11-stable, 2020-12-07 {#clickhouse-release-v209711-stable-2020-12-07} @@ -702,7 +691,6 @@ description: 'Changelog for 2020' * Update embedded timezone data to version 2020d (also update cctz to the latest master). [#17204](https://github.com/ClickHouse/ClickHouse/pull/17204) ([filimonov](https://github.com/filimonov)). - ### ClickHouse release v20.9.6.14-stable, 2020-11-20 {#clickhouse-release-v209614-stable-2020-11-20} #### Improvement {#improvement-5} @@ -724,7 +712,6 @@ description: 'Changelog for 2020' * fixes [#16574](https://github.com/ClickHouse/ClickHouse/issues/16574) fixes [#16231](https://github.com/ClickHouse/ClickHouse/issues/16231) fix remote query failure when using 'if' suffix aggregate function. [#16610](https://github.com/ClickHouse/ClickHouse/pull/16610) ([Winter Zhang](https://github.com/zhang2014)). * Query is finished faster in case of exception. Cancel execution on remote replicas if exception happens. [#15578](https://github.com/ClickHouse/ClickHouse/pull/15578) ([Azat Khuzhin](https://github.com/azat)). - ### ClickHouse release v20.9.5.5-stable, 2020-11-13 {#clickhouse-release-v20955-stable-2020-11-13} #### Bug Fix {#bug-fix-12} @@ -737,7 +724,6 @@ description: 'Changelog for 2020' * Fixed the inconsistent behaviour when a part of return data could be dropped because the set for its filtration wasn't created. [#16308](https://github.com/ClickHouse/ClickHouse/pull/16308) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). * Fix bug with MySQL database. When MySQL server used as database engine is down some queries raise Exception, because they try to get tables from disabled server, while it's unnecessary. For example, query `SELECT ... FROM system.parts` should work only with MergeTree tables and don't touch MySQL database at all. [#16032](https://github.com/ClickHouse/ClickHouse/pull/16032) ([Kruglov Pavel](https://github.com/Avogar)). - ### ClickHouse release v20.9.4.76-stable (2020-10-29) {#clickhouse-release-v209476-stable-2020-10-29} #### Bug Fix {#bug-fix-13} @@ -771,7 +757,6 @@ description: 'Changelog for 2020' * Now it's allowed to execute `ALTER ... ON CLUSTER` queries regardless of the `` setting in cluster config. [#16075](https://github.com/ClickHouse/ClickHouse/pull/16075) ([alesapin](https://github.com/alesapin)). * Unfold `{database}`, `{table}` and `{uuid}` macros in `ReplicatedMergeTree` arguments on table creation. 
[#16160](https://github.com/ClickHouse/ClickHouse/pull/16160) ([tavplubix](https://github.com/tavplubix)). - ### ClickHouse release v20.9.3.45-stable (2020-10-09) {#clickhouse-release-v209345-stable-2020-10-09} #### Bug Fix {#bug-fix-14} @@ -802,7 +787,6 @@ description: 'Changelog for 2020' * Now it's possible to change the type of version column for `VersionedCollapsingMergeTree` with `ALTER` query. [#15442](https://github.com/ClickHouse/ClickHouse/pull/15442) ([alesapin](https://github.com/alesapin)). - ### ClickHouse release v20.9.2.20, 2020-09-22 {#clickhouse-release-v209220-2020-09-22} #### Backward Incompatible Change {#backward-incompatible-change-3} @@ -877,8 +861,6 @@ description: 'Changelog for 2020' * Fix the logic in backport script. In previous versions it was triggered for any labels of 100% red color. It was strange. [#14433](https://github.com/ClickHouse/ClickHouse/pull/14433) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Integration tests use default base config. All config changes are explicit with main_configs, user_configs and dictionaries parameters for instance. [#13647](https://github.com/ClickHouse/ClickHouse/pull/13647) ([Ilya Yatsishin](https://github.com/qoega)). - - ## ClickHouse release 20.8 {#clickhouse-release-208} ### ClickHouse release v20.8.12.2-lts, 2021-01-16 {#clickhouse-release-v208122-lts-2021-01-16} @@ -888,7 +870,6 @@ description: 'Changelog for 2020' * Fix *If combinator with unary function and Nullable types. [#18806](https://github.com/ClickHouse/ClickHouse/pull/18806) ([Azat Khuzhin](https://github.com/azat)). * Restrict merges from wide to compact parts. In case of vertical merge it led to broken result part. [#18381](https://github.com/ClickHouse/ClickHouse/pull/18381) ([Anton Popov](https://github.com/CurtizJ)). - ### ClickHouse release v20.8.11.17-lts, 2020-12-25 {#clickhouse-release-v2081117-lts-2020-12-25} #### Bug Fix {#bug-fix-17} @@ -897,7 +878,6 @@ description: 'Changelog for 2020' * Fixed `value is too short` error when executing `toType(...)` functions (`toDate`, `toUInt32`, etc) with argument of type `Nullable(String)`. Now such functions return `NULL` on parsing errors instead of throwing exception. Fixes [#7673](https://github.com/ClickHouse/ClickHouse/issues/7673). [#18445](https://github.com/ClickHouse/ClickHouse/pull/18445) ([tavplubix](https://github.com/tavplubix)). * Fix possible crashes in aggregate functions with combinator `Distinct`, while using two-level aggregation. Fixes [#17682](https://github.com/ClickHouse/ClickHouse/issues/17682). [#18365](https://github.com/ClickHouse/ClickHouse/pull/18365) ([Anton Popov](https://github.com/CurtizJ)). - ### ClickHouse release v20.8.10.13-lts, 2020-12-24 {#clickhouse-release-v2081013-lts-2020-12-24} #### Bug Fix {#bug-fix-18} @@ -922,7 +902,6 @@ description: 'Changelog for 2020' * Fixed the issue when query optimization was producing wrong result if query contains `ARRAY JOIN`. [#17887](https://github.com/ClickHouse/ClickHouse/pull/17887) ([sundyli](https://github.com/sundy-li)). * Query is finished faster in case of exception. Cancel execution on remote replicas if exception happens. [#15578](https://github.com/ClickHouse/ClickHouse/pull/15578) ([Azat Khuzhin](https://github.com/azat)). 
- ### ClickHouse release v20.8.6.6-lts, 2020-11-13 {#clickhouse-release-v20866-lts-2020-11-13} #### Bug Fix {#bug-fix-19} @@ -935,7 +914,6 @@ description: 'Changelog for 2020' * Fixed the inconsistent behaviour when a part of return data could be dropped because the set for its filtration wasn't created. [#16308](https://github.com/ClickHouse/ClickHouse/pull/16308) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). * Fix bug with MySQL database. When MySQL server used as database engine is down some queries raise Exception, because they try to get tables from disabled server, while it's unnecessary. For example, query `SELECT ... FROM system.parts` should work only with MergeTree tables and don't touch MySQL database at all. [#16032](https://github.com/ClickHouse/ClickHouse/pull/16032) ([Kruglov Pavel](https://github.com/Avogar)). - ### ClickHouse release v20.8.5.45-lts, 2020-10-29 {#clickhouse-release-v208545-lts-2020-10-29} #### Bug Fix {#bug-fix-20} @@ -969,7 +947,6 @@ description: 'Changelog for 2020' * Now it's allowed to execute `ALTER ... ON CLUSTER` queries regardless of the `` setting in cluster config. [#16075](https://github.com/ClickHouse/ClickHouse/pull/16075) ([alesapin](https://github.com/alesapin)). * Unfold `{database}`, `{table}` and `{uuid}` macros in `ReplicatedMergeTree` arguments on table creation. [#16159](https://github.com/ClickHouse/ClickHouse/pull/16159) ([tavplubix](https://github.com/tavplubix)). - ### ClickHouse release v20.8.4.11-lts, 2020-10-09 {#clickhouse-release-v208411-lts-2020-10-09} #### Bug Fix {#bug-fix-21} @@ -1004,7 +981,6 @@ description: 'Changelog for 2020' * Now it's possible to change the type of version column for `VersionedCollapsingMergeTree` with `ALTER` query. [#15442](https://github.com/ClickHouse/ClickHouse/pull/15442) ([alesapin](https://github.com/alesapin)). - ### ClickHouse release v20.8.3.18-stable, 2020-09-18 {#clickhouse-release-v208318-stable-2020-09-18} #### Bug Fix {#bug-fix-22} @@ -1026,7 +1002,6 @@ description: 'Changelog for 2020' * Speed up server shutdown process if there are ongoing S3 requests. [#14496](https://github.com/ClickHouse/ClickHouse/pull/14496) ([Pavel Kovalenko](https://github.com/Jokser)). * Support custom codecs in compact parts. [#12183](https://github.com/ClickHouse/ClickHouse/pull/12183) ([Anton Popov](https://github.com/CurtizJ)). - ### ClickHouse release v20.8.2.3-stable, 2020-09-08 {#clickhouse-release-v20823-stable-2020-09-08} #### Backward Incompatible Change {#backward-incompatible-change-4} @@ -1167,7 +1142,6 @@ description: 'Changelog for 2020' * Skip PR's from robot-clickhouse. [#13489](https://github.com/ClickHouse/ClickHouse/pull/13489) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). * Move Dockerfiles from integration tests to `docker/test` directory. docker_compose files are available in `runner` docker container. Docker images are built in CI and not in integration tests. [#13448](https://github.com/ClickHouse/ClickHouse/pull/13448) ([Ilya Yatsishin](https://github.com/qoega)). - ## ClickHouse release 20.7 {#clickhouse-release-207} ### ClickHouse release v20.7.2.30-stable, 2020-08-31 {#clickhouse-release-v207230-stable-2020-08-31} @@ -1361,7 +1335,6 @@ description: 'Changelog for 2020' * Add compiler option to control that stack frames are not too large. This will help to run the code in fibers with small stack size. [#11524](https://github.com/ClickHouse/ClickHouse/pull/11524) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Update gitignore-files. 
[#13447](https://github.com/ClickHouse/ClickHouse/pull/13447) ([vladimir-golovchenko](https://github.com/vladimir-golovchenko)). - ## ClickHouse release 20.6 {#clickhouse-release-206} ### ClickHouse release v20.6.3.28-stable {#clickhouse-release-v206328-stable} @@ -1546,7 +1519,6 @@ description: 'Changelog for 2020' * Install `ca-certificates` before the first `apt-get update` in Dockerfile. [#12095](https://github.com/ClickHouse/ClickHouse/pull/12095) ([Ivan Blinkov](https://github.com/blinkov)). - ### ClickHouse release v20.5.2.7-stable 2020-07-02 {#clickhouse-release-v20527-stable-2020-07-02} #### Backward Incompatible Change {#backward-incompatible-change-7} @@ -1900,7 +1872,6 @@ description: 'Changelog for 2020' * Fix FreeBSD build. [#10150](https://github.com/ClickHouse/ClickHouse/pull/10150) ([Ivan](https://github.com/abyss7)). * Add new build for query tests using pytest framework. [#10039](https://github.com/ClickHouse/ClickHouse/pull/10039) ([Ivan](https://github.com/abyss7)). - ## ClickHouse release v20.4 {#clickhouse-release-v204} ### ClickHouse release v20.4.8.99-stable 2020-08-10 {#clickhouse-release-v204899-stable-2020-08-10} @@ -1976,7 +1947,6 @@ description: 'Changelog for 2020' * Install `ca-certificates` before the first `apt-get update` in Dockerfile. [#12095](https://github.com/ClickHouse/ClickHouse/pull/12095) ([Ivan Blinkov](https://github.com/blinkov)). - ### ClickHouse release v20.4.6.53-stable 2020-06-25 {#clickhouse-release-v204653-stable-2020-06-25} #### Bug Fix {#bug-fix-29} @@ -2016,7 +1986,6 @@ description: 'Changelog for 2020' * Fix several non significant errors in unit tests. [#11262](https://github.com/ClickHouse/ClickHouse/pull/11262) ([alesapin](https://github.com/alesapin)). * Fix (false) MSan report in MergeTreeIndexFullText. The issue first appeared in [#9968](https://github.com/ClickHouse/ClickHouse/issues/9968). [#10801](https://github.com/ClickHouse/ClickHouse/pull/10801) ([alexey-milovidov](https://github.com/alexey-milovidov)). - ### ClickHouse release v20.4.5.36-stable 2020-06-10 {#clickhouse-release-v204536-stable-2020-06-10} #### Bug Fix {#bug-fix-30} @@ -2381,10 +2350,8 @@ No changes compared to v20.4.3.16-stable. * Add support for `clang-tidy` in `packager` script. [#9625](https://github.com/ClickHouse/ClickHouse/pull/9625) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Add ability to use unbundled msgpack. [#10168](https://github.com/ClickHouse/ClickHouse/pull/10168) ([Azat Khuzhin](https://github.com/azat)) - ## ClickHouse release v20.3 {#clickhouse-release-v203} - ### ClickHouse release v20.3.21.2-lts, 2020-11-02 {#clickhouse-release-v203212-lts-2020-11-02} #### Bug Fix {#bug-fix-33} @@ -2393,7 +2360,6 @@ No changes compared to v20.4.3.16-stable. * Fix incorrect empty result for query from `Distributed` table if query has `WHERE`, `PREWHERE` and `GLOBAL IN`. Fixes [#15792](https://github.com/ClickHouse/ClickHouse/issues/15792). [#15933](https://github.com/ClickHouse/ClickHouse/pull/15933) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Fix missing or excessive headers in `TSV/CSVWithNames` formats. This fixes [#12504](https://github.com/ClickHouse/ClickHouse/issues/12504). [#13343](https://github.com/ClickHouse/ClickHouse/pull/13343) ([Azat Khuzhin](https://github.com/azat)). - ### ClickHouse release v20.3.20.6-lts, 2020-10-09 {#clickhouse-release-v203206-lts-2020-10-09} #### Bug Fix {#bug-fix-34} @@ -2404,7 +2370,6 @@ No changes compared to v20.4.3.16-stable. 
* Fix to make predicate push down work when subquery contains finalizeAggregation function. Fixes [#14847](https://github.com/ClickHouse/ClickHouse/issues/14847). [#14937](https://github.com/ClickHouse/ClickHouse/pull/14937) ([filimonov](https://github.com/filimonov)). * Concurrent `ALTER ... REPLACE/MOVE PARTITION ...` queries might cause deadlock. It's fixed. [#13626](https://github.com/ClickHouse/ClickHouse/pull/13626) ([tavplubix](https://github.com/tavplubix)). - ### ClickHouse release v20.3.19.4-lts, 2020-09-18 {#clickhouse-release-v203194-lts-2020-09-18} #### Bug Fix {#bug-fix-35} @@ -2417,7 +2382,6 @@ No changes compared to v20.4.3.16-stable. * Support custom codecs in compact parts. [#12183](https://github.com/ClickHouse/ClickHouse/pull/12183) ([Anton Popov](https://github.com/CurtizJ)). - ### ClickHouse release v20.3.18.10-lts, 2020-09-08 {#clickhouse-release-v2031810-lts-2020-09-08} #### Bug Fix {#bug-fix-36} @@ -2441,7 +2405,6 @@ No changes compared to v20.4.3.16-stable. * Fix UBSan report (adding zero to nullptr) in HashTable that appeared after migration to clang-10. [#10638](https://github.com/ClickHouse/ClickHouse/pull/10638) ([alexey-milovidov](https://github.com/alexey-milovidov)). - ### ClickHouse release v20.3.17.173-lts, 2020-08-15 {#clickhouse-release-v20317173-lts-2020-08-15} #### Bug Fix {#bug-fix-37} @@ -2451,7 +2414,6 @@ No changes compared to v20.4.3.16-stable. * Fix queries with constant columns and `ORDER BY` prefix of primary key. [#13396](https://github.com/ClickHouse/ClickHouse/pull/13396) ([Anton Popov](https://github.com/CurtizJ)). * Return passed number for numbers with MSB set in roundUpToPowerOfTwoOrZero(). [#13234](https://github.com/ClickHouse/ClickHouse/pull/13234) ([Azat Khuzhin](https://github.com/azat)). - ### ClickHouse release v20.3.16.165-lts 2020-08-10 {#clickhouse-release-v20316165-lts-2020-08-10} #### Bug Fix {#bug-fix-38} @@ -2515,7 +2477,6 @@ No changes compared to v20.4.3.16-stable. * Index not used for IN operator with literals, performance regression introduced around v19.3. This fixes [#10574](https://github.com/ClickHouse/ClickHouse/issues/10574). [#12062](https://github.com/ClickHouse/ClickHouse/pull/12062) ([nvartolomei](https://github.com/nvartolomei)). - ### ClickHouse release v20.3.12.112-lts 2020-06-25 {#clickhouse-release-v20312112-lts-2020-06-25} #### Bug Fix {#bug-fix-39} @@ -2543,7 +2504,6 @@ No changes compared to v20.4.3.16-stable. * Fix memory leak when exception is thrown in the middle of aggregation with -State functions. This fixes [#8995](https://github.com/ClickHouse/ClickHouse/issues/8995). [#11496](https://github.com/ClickHouse/ClickHouse/pull/11496) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Fix wrong results of distributed queries when alias could override qualified column name. Fixes [#9672](https://github.com/ClickHouse/ClickHouse/issues/9672) [#9714](https://github.com/ClickHouse/ClickHouse/issues/9714). [#9972](https://github.com/ClickHouse/ClickHouse/pull/9972) ([Artem Zuikov](https://github.com/4ertus2)). - ### ClickHouse release v20.3.11.97-lts 2020-06-10 {#clickhouse-release-v2031197-lts-2020-06-10} #### New Feature {#new-feature-9} @@ -2628,7 +2588,6 @@ No changes compared to v20.4.3.16-stable. * Fixed improper shutdown of `Distributed` storage. [#10491](https://github.com/ClickHouse/ClickHouse/pull/10491) ([Azat Khuzhin](https://github.com/azat)). * Fixed numeric overflow in `simpleLinearRegression` over large integers. 
[#10474](https://github.com/ClickHouse/ClickHouse/pull/10474) ([hcz](https://github.com/hczhcz)). - #### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-18} * Fix UBSan report in LZ4 library. [#10631](https://github.com/ClickHouse/ClickHouse/pull/10631) ([alexey-milovidov](https://github.com/alexey-milovidov)). @@ -2641,7 +2600,6 @@ No changes compared to v20.4.3.16-stable. * Fix error `the BloomFilter false positive must be a double number between 0 and 1` [#10551](https://github.com/ClickHouse/ClickHouse/issues/10551). [#10569](https://github.com/ClickHouse/ClickHouse/pull/10569) ([Winter Zhang](https://github.com/zhang2014)). - ### ClickHouse release v20.3.8.53, 2020-04-23 {#clickhouse-release-v203853-2020-04-23} #### Bug Fix {#bug-fix-43} @@ -2717,7 +2675,6 @@ No changes compared to v20.4.3.16-stable. * Fix integration test `test_settings_constraints`. [#9962](https://github.com/ClickHouse/ClickHouse/pull/9962) ([Vitaly Baranov](https://github.com/vitlibar)). * Removed dependency on `clock_getres`. [#9833](https://github.com/ClickHouse/ClickHouse/pull/9833) ([alexey-milovidov](https://github.com/alexey-milovidov)). - ### ClickHouse release v20.3.5.21, 2020-03-27 {#clickhouse-release-v203521-2020-03-27} #### Bug Fix {#bug-fix-46} @@ -2736,14 +2693,12 @@ No changes compared to v20.4.3.16-stable. * Remove order by stage from mutations because we read from a single ordered part in a single thread. Also add check that the order of rows in mutation is ordered in sorting key order and this order is not violated. [#9886](https://github.com/ClickHouse/ClickHouse/pull/9886) ([alesapin](https://github.com/alesapin)). - ### ClickHouse release v20.3.4.10, 2020-03-20 {#clickhouse-release-v203410-2020-03-20} #### Bug Fix {#bug-fix-47} * This release also contains all bug fixes from 20.1.8.41 * Fix missing `rows_before_limit_at_least` for queries over http (with processors pipeline). This fixes [#9730](https://github.com/ClickHouse/ClickHouse/issues/9730). [#9757](https://github.com/ClickHouse/ClickHouse/pull/9757) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) - ### ClickHouse release v20.3.3.6, 2020-03-17 {#clickhouse-release-v20336-2020-03-17} #### Bug Fix {#bug-fix-48} @@ -2989,7 +2944,6 @@ No changes compared to v20.4.3.16-stable. * Upgrade librdkafka to v1.3.0. Enable bundled `rdkafka` and `gsasl` libraries on Mac OS X. [#9000](https://github.com/ClickHouse/ClickHouse/pull/9000) ([Andrew Onyshchuk](https://github.com/oandrew)) * build fix on GCC 9.2.0 [#9306](https://github.com/ClickHouse/ClickHouse/pull/9306) ([vxider](https://github.com/Vxider)) - ## ClickHouse release v20.1 {#clickhouse-release-v201} ### ClickHouse release v20.1.16.120-stable 2020-60-26 {#clickhouse-release-v20116120-stable-2020-60-26} @@ -3011,21 +2965,18 @@ No changes compared to v20.4.3.16-stable. * Fix memory leak when exception is thrown in the middle of aggregation with -State functions. This fixes [#8995](https://github.com/ClickHouse/ClickHouse/issues/8995). [#11496](https://github.com/ClickHouse/ClickHouse/pull/11496) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Fix usage of primary key wrapped into a function with 'FINAL' modifier and 'ORDER BY' optimization. [#10715](https://github.com/ClickHouse/ClickHouse/pull/10715) ([Anton Popov](https://github.com/CurtizJ)). - ### ClickHouse release v20.1.15.109-stable 2020-06-19 {#clickhouse-release-v20115109-stable-2020-06-19} #### Bug Fix {#bug-fix-51} * Fix excess lock for structure during alter. 
[#11790](https://github.com/ClickHouse/ClickHouse/pull/11790) ([alesapin](https://github.com/alesapin)). - ### ClickHouse release v20.1.14.107-stable 2020-06-11 {#clickhouse-release-v20114107-stable-2020-06-11} #### Bug Fix {#bug-fix-52} * Fix error `Size of offsets does not match size of column` for queries with `PREWHERE column in (subquery)` and `ARRAY JOIN`. [#11580](https://github.com/ClickHouse/ClickHouse/pull/11580) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). - ### ClickHouse release v20.1.13.105-stable 2020-06-10 {#clickhouse-release-v20113105-stable-2020-06-10} #### Bug Fix {#bug-fix-53} @@ -3067,7 +3018,6 @@ No changes compared to v20.4.3.16-stable. * Fix SELECT of column ALIAS which default expression type different from column type. [#10563](https://github.com/ClickHouse/ClickHouse/pull/10563) ([Azat Khuzhin](https://github.com/azat)). * * Implemented comparison between DateTime64 and String values (just like for DateTime). [#10560](https://github.com/ClickHouse/ClickHouse/pull/10560) ([Vasily Nemkov](https://github.com/Enmk)). - ### ClickHouse release v20.1.12.86, 2020-05-26 {#clickhouse-release-v2011286-2020-05-26} #### Bug Fix {#bug-fix-54} @@ -3096,7 +3046,6 @@ No changes compared to v20.4.3.16-stable. * Added CA certificates to clickhouse-server docker image. [#10476](https://github.com/ClickHouse/ClickHouse/pull/10476) ([filimonov](https://github.com/filimonov)). - ### ClickHouse release v20.1.10.70, 2020-04-17 {#clickhouse-release-v2011070-2020-04-17} #### Bug Fix {#bug-fix-55} @@ -3175,65 +3124,64 @@ No changes compared to v20.4.3.16-stable. * Exception handling now works correctly on Windows Subsystem for Linux. See https://github.com/ClickHouse-Extras/libunwind/pull/3 This fixes [#6480](https://github.com/ClickHouse/ClickHouse/issues/6480) [#9564](https://github.com/ClickHouse/ClickHouse/pull/9564) ([sobolevsv](https://github.com/sobolevsv)) - ### ClickHouse release v20.1.6.30, 2020-03-05 {#clickhouse-release-v201630-2020-03-05} #### Bug Fix {#bug-fix-59} * Fix data incompatibility when compressed with `T64` codec. -[#9039](https://github.com/ClickHouse/ClickHouse/pull/9039) [(abyss7)](https://github.com/abyss7) + [#9039](https://github.com/ClickHouse/ClickHouse/pull/9039) [(abyss7)](https://github.com/abyss7) * Fix order of ranges while reading from MergeTree table in one thread. Fixes [#8964](https://github.com/ClickHouse/ClickHouse/issues/8964). -[#9050](https://github.com/ClickHouse/ClickHouse/pull/9050) [(CurtizJ)](https://github.com/CurtizJ) + [#9050](https://github.com/ClickHouse/ClickHouse/pull/9050) [(CurtizJ)](https://github.com/CurtizJ) * Fix possible segfault in `MergeTreeRangeReader`, while executing `PREWHERE`. Fixes [#9064](https://github.com/ClickHouse/ClickHouse/issues/9064). -[#9106](https://github.com/ClickHouse/ClickHouse/pull/9106) [(CurtizJ)](https://github.com/CurtizJ) + [#9106](https://github.com/ClickHouse/ClickHouse/pull/9106) [(CurtizJ)](https://github.com/CurtizJ) * Fix `reinterpretAsFixedString` to return `FixedString` instead of `String`. -[#9052](https://github.com/ClickHouse/ClickHouse/pull/9052) [(oandrew)](https://github.com/oandrew) + [#9052](https://github.com/ClickHouse/ClickHouse/pull/9052) [(oandrew)](https://github.com/oandrew) * Fix `joinGet` with nullable return types. 
Fixes [#8919](https://github.com/ClickHouse/ClickHouse/issues/8919) -[#9014](https://github.com/ClickHouse/ClickHouse/pull/9014) [(amosbird)](https://github.com/amosbird) + [#9014](https://github.com/ClickHouse/ClickHouse/pull/9014) [(amosbird)](https://github.com/amosbird) * Fix fuzz test and incorrect behaviour of bitTestAll/bitTestAny functions. -[#9143](https://github.com/ClickHouse/ClickHouse/pull/9143) [(alexey-milovidov)](https://github.com/alexey-milovidov) + [#9143](https://github.com/ClickHouse/ClickHouse/pull/9143) [(alexey-milovidov)](https://github.com/alexey-milovidov) * Fix the behaviour of match and extract functions when haystack has zero bytes. The behaviour was wrong when haystack was constant. Fixes [#9160](https://github.com/ClickHouse/ClickHouse/issues/9160) -[#9163](https://github.com/ClickHouse/ClickHouse/pull/9163) [(alexey-milovidov)](https://github.com/alexey-milovidov) + [#9163](https://github.com/ClickHouse/ClickHouse/pull/9163) [(alexey-milovidov)](https://github.com/alexey-milovidov) * Fixed execution of inversed predicates when non-strictly monotinic functional index is used. Fixes [#9034](https://github.com/ClickHouse/ClickHouse/issues/9034) -[#9223](https://github.com/ClickHouse/ClickHouse/pull/9223) [(Akazz)](https://github.com/Akazz) + [#9223](https://github.com/ClickHouse/ClickHouse/pull/9223) [(Akazz)](https://github.com/Akazz) * Allow to rewrite `CROSS` to `INNER JOIN` if there's `[NOT] LIKE` operator in `WHERE` section. Fixes [#9191](https://github.com/ClickHouse/ClickHouse/issues/9191) -[#9229](https://github.com/ClickHouse/ClickHouse/pull/9229) [(4ertus2)](https://github.com/4ertus2) + [#9229](https://github.com/ClickHouse/ClickHouse/pull/9229) [(4ertus2)](https://github.com/4ertus2) * Allow first column(s) in a table with Log engine be an alias. -[#9231](https://github.com/ClickHouse/ClickHouse/pull/9231) [(abyss7)](https://github.com/abyss7) + [#9231](https://github.com/ClickHouse/ClickHouse/pull/9231) [(abyss7)](https://github.com/abyss7) * Allow comma join with `IN()` inside. Fixes [#7314](https://github.com/ClickHouse/ClickHouse/issues/7314). -[#9251](https://github.com/ClickHouse/ClickHouse/pull/9251) [(4ertus2)](https://github.com/4ertus2) + [#9251](https://github.com/ClickHouse/ClickHouse/pull/9251) [(4ertus2)](https://github.com/4ertus2) * Improve `ALTER MODIFY/ADD` queries logic. Now you cannot `ADD` column without type, `MODIFY` default expression does not change type of column and `MODIFY` type does not loose default expression value. Fixes [#8669](https://github.com/ClickHouse/ClickHouse/issues/8669). -[#9227](https://github.com/ClickHouse/ClickHouse/pull/9227) [(alesapin)](https://github.com/alesapin) + [#9227](https://github.com/ClickHouse/ClickHouse/pull/9227) [(alesapin)](https://github.com/alesapin) * Fix mutations finalization, when already done mutation can have status is_done=0. -[#9217](https://github.com/ClickHouse/ClickHouse/pull/9217) [(alesapin)](https://github.com/alesapin) + [#9217](https://github.com/ClickHouse/ClickHouse/pull/9217) [(alesapin)](https://github.com/alesapin) * Support "Processors" pipeline for system.numbers and system.numbers_mt. This also fixes the bug when `max_execution_time` is not respected. -[#7796](https://github.com/ClickHouse/ClickHouse/pull/7796) [(KochetovNicolai)](https://github.com/KochetovNicolai) + [#7796](https://github.com/ClickHouse/ClickHouse/pull/7796) [(KochetovNicolai)](https://github.com/KochetovNicolai) * Fix wrong counting of `DictCacheKeysRequestedFound` metric. 
-[#9411](https://github.com/ClickHouse/ClickHouse/pull/9411) [(nikitamikhaylov)](https://github.com/nikitamikhaylov) + [#9411](https://github.com/ClickHouse/ClickHouse/pull/9411) [(nikitamikhaylov)](https://github.com/nikitamikhaylov) * Added a check for storage policy in `ATTACH PARTITION FROM`, `REPLACE PARTITION`, `MOVE TO TABLE` which otherwise could make data of part inaccessible after restart and prevent ClickHouse to start. -[#9383](https://github.com/ClickHouse/ClickHouse/pull/9383) [(excitoon)](https://github.com/excitoon) + [#9383](https://github.com/ClickHouse/ClickHouse/pull/9383) [(excitoon)](https://github.com/excitoon) * Fixed UBSan report in `MergeTreeIndexSet`. This fixes [#9250](https://github.com/ClickHouse/ClickHouse/issues/9250) -[#9365](https://github.com/ClickHouse/ClickHouse/pull/9365) [(alexey-milovidov)](https://github.com/alexey-milovidov) + [#9365](https://github.com/ClickHouse/ClickHouse/pull/9365) [(alexey-milovidov)](https://github.com/alexey-milovidov) * Fix possible datarace in BlockIO. -[#9356](https://github.com/ClickHouse/ClickHouse/pull/9356) [(KochetovNicolai)](https://github.com/KochetovNicolai) + [#9356](https://github.com/ClickHouse/ClickHouse/pull/9356) [(KochetovNicolai)](https://github.com/KochetovNicolai) * Support for `UInt64` numbers that don't fit in Int64 in JSON-related functions. Update `SIMDJSON` to master. This fixes [#9209](https://github.com/ClickHouse/ClickHouse/issues/9209) -[#9344](https://github.com/ClickHouse/ClickHouse/pull/9344) [(alexey-milovidov)](https://github.com/alexey-milovidov) + [#9344](https://github.com/ClickHouse/ClickHouse/pull/9344) [(alexey-milovidov)](https://github.com/alexey-milovidov) * Fix the issue when the amount of free space is not calculated correctly if the data directory is mounted to a separate device. For default disk calculate the free space from data subdirectory. This fixes [#7441](https://github.com/ClickHouse/ClickHouse/issues/7441) -[#9257](https://github.com/ClickHouse/ClickHouse/pull/9257) [(millb)](https://github.com/millb) + [#9257](https://github.com/ClickHouse/ClickHouse/pull/9257) [(millb)](https://github.com/millb) * Fix the issue when TLS connections may fail with the message `OpenSSL SSL_read: error:14094438:SSL routines:ssl3_read_bytes:tlsv1 alert internal error and SSL Exception: error:2400006E:random number generator::error retrieving entropy.` Update OpenSSL to upstream master. -[#8956](https://github.com/ClickHouse/ClickHouse/pull/8956) [(alexey-milovidov)](https://github.com/alexey-milovidov) + [#8956](https://github.com/ClickHouse/ClickHouse/pull/8956) [(alexey-milovidov)](https://github.com/alexey-milovidov) * When executing `CREATE` query, fold constant expressions in storage engine arguments. Replace empty database name with current database. Fixes [#6508](https://github.com/ClickHouse/ClickHouse/issues/6508), [#3492](https://github.com/ClickHouse/ClickHouse/issues/3492). Also fix check for local address in ClickHouseDictionarySource. -[#9262](https://github.com/ClickHouse/ClickHouse/pull/9262) [(tabplubix)](https://github.com/tavplubix) + [#9262](https://github.com/ClickHouse/ClickHouse/pull/9262) [(tabplubix)](https://github.com/tavplubix) * Fix segfault in `StorageMerge`, which can happen when reading from StorageFile. 
-[#9387](https://github.com/ClickHouse/ClickHouse/pull/9387) [(tabplubix)](https://github.com/tavplubix) + [#9387](https://github.com/ClickHouse/ClickHouse/pull/9387) [(tabplubix)](https://github.com/tavplubix) * Prevent losing data in `Kafka` in rare cases when exception happens after reading suffix but before commit. Fixes [#9378](https://github.com/ClickHouse/ClickHouse/issues/9378). Related: [#7175](https://github.com/ClickHouse/ClickHouse/issues/7175) -[#9507](https://github.com/ClickHouse/ClickHouse/pull/9507) [(filimonov)](https://github.com/filimonov) + [#9507](https://github.com/ClickHouse/ClickHouse/pull/9507) [(filimonov)](https://github.com/filimonov) * Fix bug leading to server termination when trying to use / drop `Kafka` table created with wrong parameters. Fixes [#9494](https://github.com/ClickHouse/ClickHouse/issues/9494). Incorporates [#9507](https://github.com/ClickHouse/ClickHouse/issues/9507). -[#9513](https://github.com/ClickHouse/ClickHouse/pull/9513) [(filimonov)](https://github.com/filimonov) + [#9513](https://github.com/ClickHouse/ClickHouse/pull/9513) [(filimonov)](https://github.com/filimonov) #### New Feature {#new-feature-12} * Add `deduplicate_blocks_in_dependent_materialized_views` option to control the behaviour of idempotent inserts into tables with materialized views. This new feature was added to the bugfix release by a special request from Altinity. -[#9070](https://github.com/ClickHouse/ClickHouse/pull/9070) [(urykhy)](https://github.com/urykhy) + [#9070](https://github.com/ClickHouse/ClickHouse/pull/9070) [(urykhy)](https://github.com/urykhy) ### ClickHouse release v20.1.2.4, 2020-01-22 {#clickhouse-release-v20124-2020-01-22} diff --git a/docs/whats-new/changelog/2021.md b/docs/whats-new/changelog/2021.md index 391f8e08d1f..ffe104656cf 100644 --- a/docs/whats-new/changelog/2021.md +++ b/docs/whats-new/changelog/2021.md @@ -183,7 +183,6 @@ description: 'Changelog for 2021' * Initial support for risc-v. See development/build-cross-riscv for quirks and build command that was tested. [#31309](https://github.com/ClickHouse/ClickHouse/pull/31309) ([Vladimir Smirnov](https://github.com/Civil)). * Support compile in arm machine with parameter "-DENABLE_TESTS=OFF". [#31007](https://github.com/ClickHouse/ClickHouse/pull/31007) ([zhanghuajie](https://github.com/zhanghuajieHIT)). - ### ClickHouse release v21.11, 2021-11-09 {#clickhouse-release-v2111-2021-11-09} #### Backward Incompatible Change {#backward-incompatible-change-1} @@ -345,7 +344,6 @@ description: 'Changelog for 2021' * Fix shutdown of `AccessControlManager` to fix flaky test. [#29951](https://github.com/ClickHouse/ClickHouse/pull/29951) ([Vitaly Baranov](https://github.com/vitlibar)). * Fix failed assertion in reading from `HDFS`. Update libhdfs3 library to be able to run in tests in debug. Closes [#29251](https://github.com/ClickHouse/ClickHouse/issues/29251). Closes [#27814](https://github.com/ClickHouse/ClickHouse/issues/27814). [#29276](https://github.com/ClickHouse/ClickHouse/pull/29276) ([Kseniia Sumarokova](https://github.com/kssenii)). - #### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-1} * Add support for FreeBSD builds for Aarch64 machines. [#29952](https://github.com/ClickHouse/ClickHouse/pull/29952) ([MikaelUrankar](https://github.com/MikaelUrankar)). @@ -449,7 +447,6 @@ description: 'Changelog for 2021' * Fix invalid constant type conversion when Nullable or LowCardinality primary key is used. 
[#28636](https://github.com/ClickHouse/ClickHouse/pull/28636) ([Amos Bird](https://github.com/amosbird)). * Fix "Column is not under aggregate function and not in GROUP BY" with PREWHERE (Fixes: [#28461](https://github.com/ClickHouse/ClickHouse/issues/28461)). [#28502](https://github.com/ClickHouse/ClickHouse/pull/28502) ([Azat Khuzhin](https://github.com/azat)). - ### ClickHouse release v21.10, 2021-10-16 {#clickhouse-release-v2110-2021-10-16} #### Backward Incompatible Change {#backward-incompatible-change-2} @@ -550,8 +547,6 @@ description: 'Changelog for 2021' * Print out git status information at CMake configure stage. [#28047](https://github.com/ClickHouse/ClickHouse/pull/28047) ([Braulio Valdivielso Martínez](https://github.com/BraulioVM)). * Temporarily switched ubuntu apt repository to mirror ru.archive.ubuntu.com as the default one (archive.ubuntu.com) is not responding from our CI. [#28016](https://github.com/ClickHouse/ClickHouse/pull/28016) ([Ilya Yatsishin](https://github.com/qoega)). - - ### ClickHouse release v21.9, 2021-09-09 {#clickhouse-release-v219-2021-09-09} #### Backward Incompatible Change {#backward-incompatible-change-3} @@ -760,7 +755,6 @@ description: 'Changelog for 2021' * Fix linking of auxiliar programs when using dynamic libraries. [#26958](https://github.com/ClickHouse/ClickHouse/pull/26958) ([Raúl Marín](https://github.com/Algunenano)). * Update RocksDB to `2021-07-16` master. [#26411](https://github.com/ClickHouse/ClickHouse/pull/26411) ([alexey-milovidov](https://github.com/alexey-milovidov)). - ### ClickHouse release v21.8, 2021-08-12 {#clickhouse-release-v218-2021-08-12} #### Upgrade Notes {#upgrade-notes} @@ -862,7 +856,6 @@ description: 'Changelog for 2021' * Fix some fuzzed msan crash. Fixes [#22517](https://github.com/ClickHouse/ClickHouse/issues/22517). [#26428](https://github.com/ClickHouse/ClickHouse/pull/26428) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Update `chown` cmd check in `clickhouse-server` docker entrypoint. It fixes error 'cluster pod restart failed (or timeout)' on Kubernetes. [#26545](https://github.com/ClickHouse/ClickHouse/pull/26545) ([Ky Li](https://github.com/Kylinrix)). - ### ClickHouse release v21.7, 2021-07-09 {#clickhouse-release-v217-2021-07-09} #### Backward Incompatible Change {#backward-incompatible-change-4} @@ -1018,7 +1011,6 @@ description: 'Changelog for 2021' * Ubuntu 20.04 is now used to run integration tests, docker-compose version used to run integration tests is updated to 1.28.2. Environment variables now take effect on docker-compose. Rework test_dictionaries_all_layouts_separate_sources to allow parallel run. [#20393](https://github.com/ClickHouse/ClickHouse/pull/20393) ([Ilya Yatsishin](https://github.com/qoega)). * Fix TOCTOU error in installation script. [#25277](https://github.com/ClickHouse/ClickHouse/pull/25277) ([alexey-milovidov](https://github.com/alexey-milovidov)). - ### ClickHouse release 21.6, 2021-06-05 {#clickhouse-release-216-2021-06-05} #### Backward Incompatible Change {#backward-incompatible-change-5} @@ -1147,7 +1139,6 @@ description: 'Changelog for 2021' * Remove a source of nondeterminism from build. Now builds at different point of time will produce byte-identical binaries. Partially addressed [#22113](https://github.com/ClickHouse/ClickHouse/issues/22113). [#23559](https://github.com/ClickHouse/ClickHouse/pull/23559) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Add simple tool for benchmarking (Zoo)Keeper. 
[#23038](https://github.com/ClickHouse/ClickHouse/pull/23038) ([alesapin](https://github.com/alesapin)). - ## ClickHouse release 21.5, 2021-05-20 {#clickhouse-release-215-2021-05-20} #### Backward Incompatible Change {#backward-incompatible-change-6} @@ -1286,7 +1277,6 @@ description: 'Changelog for 2021' * Avoid UB in `*Log` engines for rwlock unlock due to unlock from another thread. [#22583](https://github.com/ClickHouse/ClickHouse/pull/22583) ([Azat Khuzhin](https://github.com/azat)). * Fixed UB by unlocking the rwlock of the TinyLog from the same thread. [#22560](https://github.com/ClickHouse/ClickHouse/pull/22560) ([Azat Khuzhin](https://github.com/azat)). - ## ClickHouse release 21.4 {#clickhouse-release-214} ### ClickHouse release 21.4.1 2021-04-12 {#clickhouse-release-2141-2021-04-12} @@ -1298,11 +1288,10 @@ description: 'Changelog for 2021' * Fix `cutToFirstSignificantSubdomainCustom()`/`firstSignificantSubdomainCustom()` returning wrong result for 3+ level domains present in custom top-level domain list. For input domains matching these custom top-level domains, the third-level domain was considered to be the first significant one. This is now fixed. This change may introduce incompatibility if the function is used in e.g. the sharding key. [#21946](https://github.com/ClickHouse/ClickHouse/pull/21946) ([Azat Khuzhin](https://github.com/azat)). * Column `keys` in table `system.dictionaries` was replaced to columns `key.names` and `key.types`. Columns `key.names`, `key.types`, `attribute.names`, `attribute.types` from `system.dictionaries` table does not require dictionary to be loaded. [#21884](https://github.com/ClickHouse/ClickHouse/pull/21884) ([Maksim Kita](https://github.com/kitaisreal)). * Now replicas that are processing the `ALTER TABLE ATTACH PART[ITION]` command search in their `detached/` folders before fetching the data from other replicas. As an implementation detail, a new command `ATTACH_PART` is introduced in the replicated log. Parts are searched and compared by their checksums. [#18978](https://github.com/ClickHouse/ClickHouse/pull/18978) ([Mike Kot](https://github.com/myrrc)). **Note**: - * `ATTACH PART[ITION]` queries may not work during cluster upgrade. - * It's not possible to rollback to older ClickHouse version after executing `ALTER ... ATTACH` query in new version as the old servers would fail to pass the `ATTACH_PART` entry in the replicated log. + * `ATTACH PART[ITION]` queries may not work during cluster upgrade. + * It's not possible to rollback to older ClickHouse version after executing `ALTER ... ATTACH` query in new version as the old servers would fail to pass the `ATTACH_PART` entry in the replicated log. * In this version, empty `` will block all access to remote hosts while in previous versions it did nothing. If you want to keep old behaviour and you have empty `remote_url_allow_hosts` element in configuration file, remove it. [#20058](https://github.com/ClickHouse/ClickHouse/pull/20058) ([Vladimir Chebotarev](https://github.com/excitoon)). - #### New Feature {#new-feature-7} * Extended range of `DateTime64` to support dates from year 1925 to 2283. Improved support of `DateTime` around zero date (`1970-01-01`). [#9404](https://github.com/ClickHouse/ClickHouse/pull/9404) ([alexey-milovidov](https://github.com/alexey-milovidov), [Vasily Nemkov](https://github.com/Enmk)). Not every time and date functions are working for extended range of dates. @@ -1439,7 +1428,6 @@ description: 'Changelog for 2021' * Fix macOS shared lib build. 
[#20184](https://github.com/ClickHouse/ClickHouse/pull/20184) ([nvartolomei](https://github.com/nvartolomei)). * Add `ctime` option to `zookeeper-dump-tree`. It allows to dump node creation time. [#21842](https://github.com/ClickHouse/ClickHouse/pull/21842) ([Ilya](https://github.com/HumanUser)). - ## ClickHouse release 21.3 (LTS) {#clickhouse-release-213-lts} ### ClickHouse release v21.3, 2021-03-12 {#clickhouse-release-v213-2021-03-12} @@ -1595,7 +1583,6 @@ description: 'Changelog for 2021' * Fixed port clash from test_storage_kerberized_hdfs test. [#19974](https://github.com/ClickHouse/ClickHouse/pull/19974) ([Ilya Yatsishin](https://github.com/qoega)). * Print `stdout` and `stderr` to log when failed to start docker in integration tests. Before this PR there was a very short error message in this case which didn't help to investigate the problems. [#20631](https://github.com/ClickHouse/ClickHouse/pull/20631) ([Vitaly Baranov](https://github.com/vitlibar)). - ## ClickHouse release 21.2 {#clickhouse-release-212} ### ClickHouse release v21.2.2.8-stable, 2021-02-07 {#clickhouse-release-v21228-stable-2021-02-07} @@ -1722,7 +1709,6 @@ description: 'Changelog for 2021' * Fix data type convert issue for MySQL engine. [#18124](https://github.com/ClickHouse/ClickHouse/pull/18124) ([bo zeng](https://github.com/mis98zb)). * Fix clickhouse-client abort exception while executing only `select`. [#19790](https://github.com/ClickHouse/ClickHouse/pull/19790) ([taiyang-li](https://github.com/taiyang-li)). - #### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-9} * Run [SQLancer](https://twitter.com/RiggerManuel/status/1352345625480884228) (logical SQL fuzzer) in CI. [#19006](https://github.com/ClickHouse/ClickHouse/pull/19006) ([Ilya Yatsishin](https://github.com/qoega)). @@ -1740,7 +1726,6 @@ description: 'Changelog for 2021' * Fix potential nullptr dereference in table function `VALUES`. [#19357](https://github.com/ClickHouse/ClickHouse/pull/19357) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Avoid UBSan reports in `arrayElement` function, `substring` and `arraySum`. Fixes [#19305](https://github.com/ClickHouse/ClickHouse/issues/19305). Fixes [#19287](https://github.com/ClickHouse/ClickHouse/issues/19287). This closes [#19336](https://github.com/ClickHouse/ClickHouse/issues/19336). [#19347](https://github.com/ClickHouse/ClickHouse/pull/19347) ([alexey-milovidov](https://github.com/alexey-milovidov)). - ## ClickHouse release 21.1 {#clickhouse-release-211} ### ClickHouse release v21.1.3.32-stable, 2021-02-03 {#clickhouse-release-v211332-stable-2021-02-03} @@ -1771,8 +1756,6 @@ description: 'Changelog for 2021' * Disable constant folding for subqueries on the analysis stage, when the result cannot be calculated. [#18446](https://github.com/ClickHouse/ClickHouse/pull/18446) ([Azat Khuzhin](https://github.com/azat)). * Mutation might hang waiting for some non-existent part after `MOVE` or `REPLACE PARTITION` or, in rare cases, after `DETACH` or `DROP PARTITION`. It's fixed. [#15537](https://github.com/ClickHouse/ClickHouse/pull/15537) ([tavplubix](https://github.com/tavplubix)). - - ### ClickHouse release v21.1.2.15-stable 2021-01-18 {#clickhouse-release-v211215-stable-2021-01-18} #### Backward Incompatible Change {#backward-incompatible-change-10} @@ -1832,14 +1815,12 @@ description: 'Changelog for 2021' * Added `query` parameter for `clickhouse-benchmark`. [#17832](https://github.com/ClickHouse/ClickHouse/pull/17832) ([Maksim Kita](https://github.com/kitaisreal)). 
* `EXPLAIN AST` now support queries other then `SELECT`. [#18136](https://github.com/ClickHouse/ClickHouse/pull/18136) ([taiyang-li](https://github.com/taiyang-li)). - #### Experimental Feature {#experimental-feature-8} * Added functions for calculation of minHash and simHash of text n-grams and shingles. They are intended for semi-duplicate search. Also functions `bitHammingDistance` and `tupleHammingDistance` are added. [#7649](https://github.com/ClickHouse/ClickHouse/pull/7649) ([flynn](https://github.com/ucasFL)). * Add new data type `Map`. See [#1841](https://github.com/ClickHouse/ClickHouse/issues/1841). First version for Map only supports `String` type of key and value. [#15806](https://github.com/ClickHouse/ClickHouse/pull/15806) ([hexiaoting](https://github.com/hexiaoting)). * Implement alternative SQL parser based on ANTLR4 runtime and generated from EBNF grammar. [#11298](https://github.com/ClickHouse/ClickHouse/pull/11298) ([Ivan](https://github.com/abyss7)). - #### Performance Improvement {#performance-improvement-10} * New IP Dictionary implementation with lower memory consumption, improved performance for some cases, and fixed bugs. [#16804](https://github.com/ClickHouse/ClickHouse/pull/16804) ([vdimir](https://github.com/vdimir)). @@ -1864,7 +1845,6 @@ description: 'Changelog for 2021' * Support for async tasks in `PipelineExecutor`. Initial support of async sockets for remote queries. [#17868](https://github.com/ClickHouse/ClickHouse/pull/17868) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Allow to use `optimize_move_to_prewhere` optimization with compact parts, when sizes of columns are unknown. [#17330](https://github.com/ClickHouse/ClickHouse/pull/17330) ([Anton Popov](https://github.com/CurtizJ)). - #### Improvement {#improvement-10} * Avoid deadlock when executing INSERT SELECT into itself from a table with `TinyLog` or `Log` table engines. This closes [#6802](https://github.com/ClickHouse/ClickHouse/issues/6802). This closes [#18691](https://github.com/ClickHouse/ClickHouse/issues/18691). This closes [#16812](https://github.com/ClickHouse/ClickHouse/issues/16812). This closes [#14570](https://github.com/ClickHouse/ClickHouse/issues/14570). [#15260](https://github.com/ClickHouse/ClickHouse/pull/15260) ([alexey-milovidov](https://github.com/alexey-milovidov)). @@ -1927,7 +1907,6 @@ description: 'Changelog for 2021' * Fix never worked `fsync_part_directory`/`fsync_after_insert`/`in_memory_parts_insert_sync` (experimental feature). [#18845](https://github.com/ClickHouse/ClickHouse/pull/18845) ([Azat Khuzhin](https://github.com/azat)). * Allow using `Atomic` engine for nested database of `MaterializeMySQL` engine. [#14849](https://github.com/ClickHouse/ClickHouse/pull/14849) ([tavplubix](https://github.com/tavplubix)). - #### Bug Fix {#bug-fix-10} * Fix the issue when server can stop accepting connections in very rare cases. [#17542](https://github.com/ClickHouse/ClickHouse/pull/17542) (Amos Bird, [alexey-milovidov](https://github.com/alexey-milovidov)). @@ -2018,7 +1997,6 @@ description: 'Changelog for 2021' * Throw error when `REPLACE` column transformer operates on non existing column. [#16183](https://github.com/ClickHouse/ClickHouse/pull/16183) ([hexiaoting](https://github.com/hexiaoting)). * Throw exception in case of not equi-join ON expression in RIGH|FULL JOIN. [#15162](https://github.com/ClickHouse/ClickHouse/pull/15162) ([Artem Zuikov](https://github.com/4ertus2)). 
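For the equi-join entry directly above, a minimal sketch of the affected query shape may help; the table and column names are hypothetical and only the shape of the `ON` clause matters.

```sql
-- Hypothetical tables. An equality-only ON clause is accepted:
SELECT *
FROM orders AS o
FULL JOIN payments AS p ON o.order_id = p.order_id;

-- Adding a non-equality condition to the ON clause of a RIGHT/FULL JOIN
-- is the case that now throws an exception:
SELECT *
FROM orders AS o
FULL JOIN payments AS p ON o.order_id = p.order_id AND o.amount > p.amount;
```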
- #### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-10} * Add simple integrity check for ClickHouse binary. It allows to detect corruption due to faulty hardware (bit rot on storage media or bit flips in RAM). [#18811](https://github.com/ClickHouse/ClickHouse/pull/18811) ([alexey-milovidov](https://github.com/alexey-milovidov)). @@ -2052,5 +2030,4 @@ description: 'Changelog for 2021' * Minor improvement for path concatenation of zookeeper paths inside DDLWorker. [#17767](https://github.com/ClickHouse/ClickHouse/pull/17767) ([Bharat Nallan](https://github.com/bharatnc)). * Allow to reload symbols from debug file. This PR also fixes a build-id issue. [#17637](https://github.com/ClickHouse/ClickHouse/pull/17637) ([Amos Bird](https://github.com/amosbird)). - ## [Changelog for 2020](./2020.md) {#changelog-for-2020} diff --git a/docs/whats-new/changelog/2022.md b/docs/whats-new/changelog/2022.md index 4d7dcf4d498..303d83617fb 100644 --- a/docs/whats-new/changelog/2022.md +++ b/docs/whats-new/changelog/2022.md @@ -370,8 +370,8 @@ Refer to this issue on GitHub for more details: https://github.com/ClickHouse/Cl * Add a test to ensure that every new table function will be documented. See [#40649](https://github.com/ClickHouse/ClickHouse/issues/40649). Rename table function `MeiliSearch` to `meilisearch`. [#40709](https://github.com/ClickHouse/ClickHouse/pull/40709) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Add a test to ensure that every new function will be documented. See [#40649](https://github.com/ClickHouse/ClickHouse/pull/40649). The functions `lemmatize`, `synonyms`, `stem` were case-insensitive by mistake. Now they are case-sensitive. [#40711](https://github.com/ClickHouse/ClickHouse/pull/40711) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * For security and stability reasons, catboost models are no longer evaluated within the ClickHouse server. Instead, the evaluation is now - done in the clickhouse-library-bridge, a separate process that loads the catboost library and communicates with the server process via - HTTP. [#40897](https://github.com/ClickHouse/ClickHouse/pull/40897) ([Robert Schulze](https://github.com/rschu1ze)). + done in the clickhouse-library-bridge, a separate process that loads the catboost library and communicates with the server process via + HTTP. [#40897](https://github.com/ClickHouse/ClickHouse/pull/40897) ([Robert Schulze](https://github.com/rschu1ze)). * Make interpretation of YAML configs to be more conventional. [#41044](https://github.com/ClickHouse/ClickHouse/pull/41044) ([Vitaly Baranov](https://github.com/vitlibar)). #### New Feature {#new-feature-3} @@ -707,7 +707,6 @@ Refer to this issue on GitHub for more details: https://github.com/ClickHouse/Cl * A fix for reverse DNS resolution. [#40134](https://github.com/ClickHouse/ClickHouse/pull/40134) ([Arthur Passos](https://github.com/arthurpassos)). * Fix unexpected result `arrayDifference` of `Array(UInt32). [#40211](https://github.com/ClickHouse/ClickHouse/pull/40211) ([Duc Canh Le](https://github.com/canhld94)). - ### ClickHouse release 22.7, 2022-07-21 {#a-id227a-clickhouse-release-227-2022-07-21} #### Upgrade Notes {#upgrade-notes-1} @@ -1065,7 +1064,7 @@ Refer to this issue on GitHub for more details: https://github.com/ClickHouse/Cl #### Experimental Feature {#experimental-feature-6} * Implemented L1, L2, Linf, Cosine distance functions for arrays and L1, L2, Linf norm functions for arrays. 
- [#37033](https://github.com/ClickHouse/ClickHouse/pull/37033) ([qieqieplus](https://github.com/qieqieplus)). Caveat: the functions will be renamed. + [#37033](https://github.com/ClickHouse/ClickHouse/pull/37033) ([qieqieplus](https://github.com/qieqieplus)). Caveat: the functions will be renamed. * Improve the `WATCH` query in WindowView: 1. Reduce the latency of providing query results by calling the `fire_condition` signal. 2. Makes the cancel query operation(ctrl-c) faster, by checking `isCancelled()` more frequently. [#37226](https://github.com/ClickHouse/ClickHouse/pull/37226) ([vxider](https://github.com/Vxider)). * Introspection for remove filesystem cache. [#36802](https://github.com/ClickHouse/ClickHouse/pull/36802) ([Han Shukai](https://github.com/KinderRiven)). * Added new hash function `wyHash64` for SQL. [#36467](https://github.com/ClickHouse/ClickHouse/pull/36467) ([olevino](https://github.com/olevino)). @@ -1201,7 +1200,6 @@ Refer to this issue on GitHub for more details: https://github.com/ClickHouse/Cl * Fix ALTER DROP COLUMN of nested column with compact parts (i.e. `ALTER TABLE x DROP COLUMN n`, when there is column `n.d`). [#35797](https://github.com/ClickHouse/ClickHouse/pull/35797) ([Azat Khuzhin](https://github.com/azat)). * Fix substring function range error length when `offset` and `length` is negative constant and `s` is not constant. [#33861](https://github.com/ClickHouse/ClickHouse/pull/33861) ([RogerYK](https://github.com/RogerYK)). - ### ClickHouse release 22.4, 2022-04-19 {#a-id224a-clickhouse-release-224-2022-04-19} #### Backward Incompatible Change {#backward-incompatible-change-5} @@ -1353,7 +1351,6 @@ Refer to this issue on GitHub for more details: https://github.com/ClickHouse/Cl * Fix mutations in tables with enabled sparse columns. [#35284](https://github.com/ClickHouse/ClickHouse/pull/35284) ([Anton Popov](https://github.com/CurtizJ)). * Do not delay final part writing by default (fixes possible `Memory limit exceeded` during `INSERT` by adding `max_insert_delayed_streams_for_parallel_write` with default to 1000 for writes to s3 and disabled as before otherwise). [#34780](https://github.com/ClickHouse/ClickHouse/pull/34780) ([Azat Khuzhin](https://github.com/azat)). - ### ClickHouse release v22.3-lts, 2022-03-17 {#a-id223a-clickhouse-release-v223-lts-2022-03-17} #### Backward Incompatible Change {#backward-incompatible-change-6} @@ -1481,7 +1478,6 @@ Refer to this issue on GitHub for more details: https://github.com/ClickHouse/Cl * Fix incorrect result of trivial count query when part movement feature is used [#34089](https://github.com/ClickHouse/ClickHouse/issues/34089). [#34385](https://github.com/ClickHouse/ClickHouse/pull/34385) ([nvartolomei](https://github.com/nvartolomei)). * Fix inconsistency of `max_query_size` limitation in distributed subqueries. [#34078](https://github.com/ClickHouse/ClickHouse/pull/34078) ([Chao Ma](https://github.com/godliness)). - ### ClickHouse release v22.2, 2022-02-17 {#a-id222a-clickhouse-release-v222-2022-02-17} #### Upgrade Notes {#upgrade-notes-3} @@ -1657,7 +1653,6 @@ Refer to this issue on GitHub for more details: https://github.com/ClickHouse/Cl * Fix issue [#18206](https://github.com/ClickHouse/ClickHouse/issues/18206). [#33977](https://github.com/ClickHouse/ClickHouse/pull/33977) ([Vitaly Baranov](https://github.com/vitlibar)). * This PR allows using multiple LDAP storages in the same list of user directories. 
It worked earlier but was broken because LDAP tests are disabled (they are part of the testflows tests). [#33574](https://github.com/ClickHouse/ClickHouse/pull/33574) ([Vitaly Baranov](https://github.com/vitlibar)). - ### ClickHouse release v22.1, 2022-01-18 {#a-id221a-clickhouse-release-v221-2022-01-18} #### Upgrade Notes {#upgrade-notes-4} @@ -1684,7 +1679,6 @@ Refer to this issue on GitHub for more details: https://github.com/ClickHouse/Cl * Add function `decodeURLFormComponent` slightly different to `decodeURLComponent`. Close [#10298](https://github.com/ClickHouse/ClickHouse/issues/10298). [#33451](https://github.com/ClickHouse/ClickHouse/pull/33451) ([SuperDJY](https://github.com/cmsxbc)). * Allow to split `GraphiteMergeTree` rollup rules for plain/tagged metrics (optional rule_type field). [#33494](https://github.com/ClickHouse/ClickHouse/pull/33494) ([Michail Safronov](https://github.com/msaf1980)). - #### Performance Improvement {#performance-improvement-11} * Support moving conditions to `PREWHERE` (setting `optimize_move_to_prewhere`) for tables of `Merge` engine if its all underlying tables supports `PREWHERE`. [#33300](https://github.com/ClickHouse/ClickHouse/pull/33300) ([Anton Popov](https://github.com/CurtizJ)). @@ -1700,7 +1694,6 @@ Refer to this issue on GitHub for more details: https://github.com/ClickHouse/Cl * Optimize selecting of MergeTree parts that can be moved between volumes. [#33225](https://github.com/ClickHouse/ClickHouse/pull/33225) ([OnePiece](https://github.com/zhongyuankai)). * Fix `sparse_hashed` dict performance with sequential keys (wrong hash function). [#32536](https://github.com/ClickHouse/ClickHouse/pull/32536) ([Azat Khuzhin](https://github.com/azat)). - #### Experimental Feature {#experimental-feature-10} * Parallel reading from multiple replicas within a shard during distributed query without using sample key. To enable this, set `allow_experimental_parallel_reading_from_replicas = 1` and `max_parallel_replicas` to any number. This closes [#26748](https://github.com/ClickHouse/ClickHouse/issues/26748). [#29279](https://github.com/ClickHouse/ClickHouse/pull/29279) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). @@ -1713,7 +1706,6 @@ Refer to this issue on GitHub for more details: https://github.com/ClickHouse/Cl * Fix ACL with explicit digit hash in `clickhouse-keeper`: now the behavior consistent with ZooKeeper and generated digest is always accepted. [#33249](https://github.com/ClickHouse/ClickHouse/pull/33249) ([小路](https://github.com/nicelulu)). [#33246](https://github.com/ClickHouse/ClickHouse/pull/33246). * Fix unexpected projection removal when detaching parts. [#32067](https://github.com/ClickHouse/ClickHouse/pull/32067) ([Amos Bird](https://github.com/amosbird)). - #### Improvement {#improvement-11} * Now date time conversion functions that generates time before `1970-01-01 00:00:00` will be saturated to zero instead of overflow. [#29953](https://github.com/ClickHouse/ClickHouse/pull/29953) ([Amos Bird](https://github.com/amosbird)). It also fixes a bug in index analysis if date truncation function would yield result before the Unix epoch. @@ -1760,7 +1752,6 @@ Refer to this issue on GitHub for more details: https://github.com/ClickHouse/Cl * Updating `modification_time` for data part in `system.parts` after part movement [#32964](https://github.com/ClickHouse/ClickHouse/issues/32964). [#32965](https://github.com/ClickHouse/ClickHouse/pull/32965) ([save-my-heart](https://github.com/save-my-heart)). 
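As a rough illustration of the `system.parts` entry above (`modification_time` is now updated after a part is moved), one might inspect the column like this; the table name is hypothetical.

```sql
-- Hypothetical table name; shows where each active part lives and when it last changed.
SELECT name, disk_name, modification_time
FROM system.parts
WHERE database = currentDatabase()
  AND table = 'events'
  AND active
ORDER BY modification_time DESC;
```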
* Potential issue, cannot be exploited: integer overflow may happen in array resize. [#33024](https://github.com/ClickHouse/ClickHouse/pull/33024) ([varadarajkumar](https://github.com/varadarajkumar)). - #### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-11} * Add packages, functional tests and Docker builds for AArch64 (ARM) version of ClickHouse. [#32911](https://github.com/ClickHouse/ClickHouse/pull/32911) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). [#32415](https://github.com/ClickHouse/ClickHouse/pull/32415) @@ -1775,7 +1766,6 @@ Refer to this issue on GitHub for more details: https://github.com/ClickHouse/Cl * Inject git information into clickhouse binary file. So we can get source code revision easily from clickhouse binary file. [#33124](https://github.com/ClickHouse/ClickHouse/pull/33124) ([taiyang-li](https://github.com/taiyang-li)). * Remove obsolete code from ConfigProcessor. Yandex specific code is not used anymore. The code contained one minor defect. This defect was reported by [Mallik Hassan](https://github.com/SadiHassan) in [#33032](https://github.com/ClickHouse/ClickHouse/issues/33032). This closes [#33032](https://github.com/ClickHouse/ClickHouse/issues/33032). [#33026](https://github.com/ClickHouse/ClickHouse/pull/33026) ([alexey-milovidov](https://github.com/alexey-milovidov)). - #### Bug Fix (user-visible misbehavior in official stable or prestable release) {#bug-fix-user-visible-misbehavior-in-official-stable-or-prestable-release-4} * Several fixes for format parsing. This is relevant if `clickhouse-server` is open for write access to adversary. Specifically crafted input data for `Native` format may lead to reading uninitialized memory or crash. This is relevant if `clickhouse-server` is open for write access to adversary. [#33050](https://github.com/ClickHouse/ClickHouse/pull/33050) ([Heena Bansal](https://github.com/HeenaBansal2009)). Fixed Apache Avro Union type index out of boundary issue in Apache Avro binary format. [#33022](https://github.com/ClickHouse/ClickHouse/pull/33022) ([Harry Lee](https://github.com/HarryLeeIBM)). Fix null pointer dereference in `LowCardinality` data when deserializing `LowCardinality` data in the Native format. [#33021](https://github.com/ClickHouse/ClickHouse/pull/33021) ([Harry Lee](https://github.com/HarryLeeIBM)). @@ -1834,5 +1824,4 @@ Refer to this issue on GitHub for more details: https://github.com/ClickHouse/Cl * Fix possible crash (or incorrect result) in case of `LowCardinality` arguments of window function. Fixes [#31114](https://github.com/ClickHouse/ClickHouse/issues/31114). [#31888](https://github.com/ClickHouse/ClickHouse/pull/31888) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Fix hang up with command `DROP TABLE system.query_log sync`. [#33293](https://github.com/ClickHouse/ClickHouse/pull/33293) ([zhanghuajie](https://github.com/zhanghuajieHIT)). - ## [Changelog for 2021](./2021.md) {#changelog-for-2021} diff --git a/docs/whats-new/changelog/2023.md b/docs/whats-new/changelog/2023.md index a83bf471a07..944e68f7948 100644 --- a/docs/whats-new/changelog/2023.md +++ b/docs/whats-new/changelog/2023.md @@ -160,7 +160,6 @@ description: 'Changelog for 2023' * Fix a slow-down of CREATE VIEW with an enormous number of subqueries [#58220](https://github.com/ClickHouse/ClickHouse/pull/58220) ([Tao Wang](https://github.com/wangtZJU)). 
* Fix parallel parsing for JSONCompactEachRow [#58181](https://github.com/ClickHouse/ClickHouse/pull/58181) ([Alexey Milovidov](https://github.com/alexey-milovidov)). [#58250](https://github.com/ClickHouse/ClickHouse/pull/58250) ([Kruglov Pavel](https://github.com/Avogar)). - ### ClickHouse release 23.11, 2023-12-06 {#2311} #### Backward Incompatible Change {#backward-incompatible-change-1} @@ -370,7 +369,6 @@ description: 'Changelog for 2023' * MergeTree mutations reuse source part index granularity [#57352](https://github.com/ClickHouse/ClickHouse/pull/57352) ([Maksim Kita](https://github.com/kitaisreal)). * FS cache: add a limit for background download [#57424](https://github.com/ClickHouse/ClickHouse/pull/57424) ([Kseniia Sumarokova](https://github.com/kssenii)). - ### ClickHouse release 23.10, 2023-11-02 {#2310} #### Backward Incompatible Change {#backward-incompatible-change-2} @@ -549,7 +547,6 @@ description: 'Changelog for 2023' * Fix schema cache for fallback JSON->JSONEachRow with changed settings [#56172](https://github.com/ClickHouse/ClickHouse/pull/56172) ([Kruglov Pavel](https://github.com/Avogar)). * Add error handler to odbc-bridge [#56185](https://github.com/ClickHouse/ClickHouse/pull/56185) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). - ### ClickHouse release 23.9, 2023-09-28 {#239} #### Backward Incompatible Change {#backward-incompatible-change-3} @@ -716,7 +713,6 @@ description: 'Changelog for 2023' * Fix: insert quorum w/o keeper retries [#55026](https://github.com/ClickHouse/ClickHouse/pull/55026) ([Igor Nikonov](https://github.com/devcrafter)). * Fix simple state with nullable [#55030](https://github.com/ClickHouse/ClickHouse/pull/55030) ([Pedro Riera](https://github.com/priera)). - ### ClickHouse release 23.8 LTS, 2023-08-31 {#238} #### Backward Incompatible Change {#backward-incompatible-change-4} @@ -1111,7 +1107,6 @@ description: 'Changelog for 2023' * Fix lightweight delete after drop of projection [#52517](https://github.com/ClickHouse/ClickHouse/pull/52517) ([Anton Popov](https://github.com/CurtizJ)). * Fix possible error "Cannot drain connections: cancel first" [#52585](https://github.com/ClickHouse/ClickHouse/pull/52585) ([Kruglov Pavel](https://github.com/Avogar)). - ### ClickHouse release 23.6, 2023-06-29 {#236} #### Backward Incompatible Change {#backward-incompatible-change-6} @@ -1211,7 +1206,6 @@ description: 'Changelog for 2023' * Fix fuzzer failure in ActionsDAG [#51301](https://github.com/ClickHouse/ClickHouse/pull/51301) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Remove garbage from function `transform` [#51350](https://github.com/ClickHouse/ClickHouse/pull/51350) ([Alexey Milovidov](https://github.com/alexey-milovidov)). - ### ClickHouse release 23.5, 2023-06-08 {#235} #### Upgrade Notes {#upgrade-notes} @@ -1865,7 +1859,7 @@ description: 'Changelog for 2023' * Added retries on interserver DNS errors. [#43179](https://github.com/ClickHouse/ClickHouse/pull/43179) ([Anton Kozlov](https://github.com/tonickkozlov)). * Keeper improvement: try preallocating space on the disk to avoid undefined out-of-space issues. Introduce setting `max_log_file_size` for the maximum size of Keeper's Raft log files. [#44370](https://github.com/ClickHouse/ClickHouse/pull/44370) ([Antonio Andelic](https://github.com/antonio2368)). * Optimize behavior for a replica delay api logic in case the replica is read-only. [#45148](https://github.com/ClickHouse/ClickHouse/pull/45148) ([mateng915](https://github.com/mateng0915)). 
-* Ask for the password in clickhouse-client interactively in a case when the empty password is wrong. Closes [#46702](https://github.com/ClickHouse/ClickHouse/issues/46702). [#46730](https://github.com/ClickHouse/ClickHouse/pull/46730) ([Nikolay Degterinsky](https://github.com/evillique)). +* Ask for the password in clickhouse-client interactively in a case when the empty password is wrong. Closes [#46702](https://github.com/ClickHouse/ClickHouse/issues/46702). [#46730](https://github.com/ClickHouse/ClickHouse/pull/46730) ([Nikolay Degterinsky](https://github.com/evillique)). * Mark `Gorilla` compression on columns of non-Float* type as suspicious. [#45376](https://github.com/ClickHouse/ClickHouse/pull/45376) ([Robert Schulze](https://github.com/rschu1ze)). * Show replica name that is executing a merge in the `postpone_reason` column. [#45458](https://github.com/ClickHouse/ClickHouse/pull/45458) ([Frank Chen](https://github.com/FrankChen021)). * Save exception stack trace in part_log. [#45459](https://github.com/ClickHouse/ClickHouse/pull/45459) ([Frank Chen](https://github.com/FrankChen021)). @@ -1916,7 +1910,6 @@ description: 'Changelog for 2023' * Support for IN clause with parameter in parameterized views. [#46583](https://github.com/ClickHouse/ClickHouse/pull/46583) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). * Do not load named collections on server startup (load them on first access instead). [#46607](https://github.com/ClickHouse/ClickHouse/pull/46607) ([Kseniia Sumarokova](https://github.com/kssenii)). - #### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-10} * Introduce GWP-ASan implemented by the LLVM runtime. This closes [#27039](https://github.com/ClickHouse/ClickHouse/issues/27039). [#45226](https://github.com/ClickHouse/ClickHouse/pull/45226) ([Han Fei](https://github.com/hanfei1991)). * We want to make our tests less stable and more flaky: add randomization for merge tree settings in tests. [#38983](https://github.com/ClickHouse/ClickHouse/pull/38983) ([Anton Popov](https://github.com/CurtizJ)). @@ -1929,7 +1922,6 @@ description: 'Changelog for 2023' * Raised the minimum Clang version needed to build ClickHouse from 12 to 15. [#46710](https://github.com/ClickHouse/ClickHouse/pull/46710) ([Robert Schulze](https://github.com/rschu1ze)). * Upgrade Intel QPL from v0.3.0 to v1.0.0 2. Build libaccel-config and link it statically to QPL library instead of dynamically. [#45809](https://github.com/ClickHouse/ClickHouse/pull/45809) ([jasperzhu](https://github.com/jinjunzh)). - #### Bug Fix (user-visible misbehavior in official stable release) {#bug-fix-user-visible-misbehavior-in-official-stable-release} * Flush data exactly by `rabbitmq_flush_interval_ms` or by `rabbitmq_max_block_size` in `StorageRabbitMQ`. Closes [#42389](https://github.com/ClickHouse/ClickHouse/issues/42389). Closes [#45160](https://github.com/ClickHouse/ClickHouse/issues/45160). [#44404](https://github.com/ClickHouse/ClickHouse/pull/44404) ([Kseniia Sumarokova](https://github.com/kssenii)). @@ -1976,7 +1968,6 @@ description: 'Changelog for 2023' * Allocated during asynchronous inserts memory buffers were deallocated in the global context and MemoryTracker counters for corresponding user and query were not updated correctly. That led to false positive OOM exceptions. [#46622](https://github.com/ClickHouse/ClickHouse/pull/46622) ([Dmitry Novik](https://github.com/novikd)). 
* Updated to not clear on_expression from table_join as its used by future analyze runs resolves [#45185](https://github.com/ClickHouse/ClickHouse/issues/45185). [#46487](https://github.com/ClickHouse/ClickHouse/pull/46487) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). - ### ClickHouse release 23.1, 2023-01-26 {#231} ### ClickHouse release 23.1 {#clickhouse-release-231} @@ -1984,13 +1975,12 @@ description: 'Changelog for 2023' #### Upgrade Notes {#upgrade-notes-2} * The `SYSTEM RESTART DISK` query becomes a no-op. [#44647](https://github.com/ClickHouse/ClickHouse/pull/44647) ([alesapin](https://github.com/alesapin)). * The `PREALLOCATE` option for `HASHED`/`SPARSE_HASHED` dictionaries becomes a no-op. [#45388](https://github.com/ClickHouse/ClickHouse/pull/45388) ([Azat Khuzhin](https://github.com/azat)). It does not give significant advantages anymore. -* Disallow `Gorilla` codec on columns of non-Float32 or non-Float64 type. [#45252](https://github.com/ClickHouse/ClickHouse/pull/45252) ([Robert Schulze](https://github.com/rschu1ze)). It was pointless and led to inconsistencies. +* Disallow `Gorilla` codec on columns of non-Float32 or non-Float64 type. [#45252](https://github.com/ClickHouse/ClickHouse/pull/45252) ([Robert Schulze](https://github.com/rschu1ze)). It was pointless and led to inconsistencies. * Parallel quorum inserts might work incorrectly with `*MergeTree` tables created with the deprecated syntax. Therefore, parallel quorum inserts support is completely disabled for such tables. It does not affect tables created with a new syntax. [#45430](https://github.com/ClickHouse/ClickHouse/pull/45430) ([Alexander Tokmakov](https://github.com/tavplubix)). * Use the `GetObjectAttributes` request instead of the `HeadObject` request to get the size of an object in AWS S3. This change fixes handling endpoints without explicit regions after updating the AWS SDK, for example. [#45288](https://github.com/ClickHouse/ClickHouse/pull/45288) ([Vitaly Baranov](https://github.com/vitlibar)). AWS S3 and Minio are tested, but keep in mind that various S3-compatible services (GCS, R2, B2) may have subtle incompatibilities. This change also may require you to adjust the ACL to allow the `GetObjectAttributes` request. * Forbid paths in timezone names. For example, a timezone name like `/usr/share/zoneinfo/Asia/Aden` is not allowed; the IANA timezone database name like `Asia/Aden` should be used. [#44225](https://github.com/ClickHouse/ClickHouse/pull/44225) ([Kruglov Pavel](https://github.com/Avogar)). * Queries combining equijoin and constant expressions (e.g., `JOIN ON t1.x = t2.x AND 1 = 1`) are forbidden due to incorrect results. [#44016](https://github.com/ClickHouse/ClickHouse/pull/44016) ([Vladimir C](https://github.com/vdimir)). - #### New Feature {#new-feature-11} * Dictionary source for extracting keys by traversing regular expressions tree. It can be used for User-Agent parsing. [#40878](https://github.com/ClickHouse/ClickHouse/pull/40878) ([Vage Ogannisian](https://github.com/nooblose)). [#43858](https://github.com/ClickHouse/ClickHouse/pull/43858) ([Han Fei](https://github.com/hanfei1991)). * Added parametrized view functionality, now it's possible to specify query parameters for the View table engine. resolves [#40907](https://github.com/ClickHouse/ClickHouse/issues/40907). [#41687](https://github.com/ClickHouse/ClickHouse/pull/41687) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). 
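The parametrized view entry above is easiest to see with a concrete query. The sketch below is illustrative only; the table, column, and parameter names (`web_events`, `site_id`, `site`) are assumptions, not taken from the changelog.

```sql
-- Hypothetical table and parameter names, shown only to illustrate the feature.
CREATE VIEW hits_by_site AS
SELECT
    toDate(event_time) AS day,
    count() AS hits
FROM web_events
WHERE site_id = {site:UInt32}
GROUP BY day;

-- The parameter is supplied when the view is queried:
SELECT * FROM hits_by_site(site = 42);
```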
@@ -2008,7 +1998,7 @@ description: 'Changelog for 2023' * Support reading/writing `Nested` tables as `List` of `Struct` in `CapnProto` format. Read/write `Decimal32/64` as `Int32/64`. Closes [#43319](https://github.com/ClickHouse/ClickHouse/issues/43319). [#43379](https://github.com/ClickHouse/ClickHouse/pull/43379) ([Kruglov Pavel](https://github.com/Avogar)). * Added a `message_format_string` column to `system.text_log`. The column contains a pattern that was used to format the message. [#44543](https://github.com/ClickHouse/ClickHouse/pull/44543) ([Alexander Tokmakov](https://github.com/tavplubix)). This allows various analytics over the ClickHouse logs. * Try to autodetect headers with column names (and maybe types) for CSV/TSV/CustomSeparated input formats. -Add settings input_format_tsv/csv/custom_detect_header that enable this behaviour (enabled by default). Closes [#44640](https://github.com/ClickHouse/ClickHouse/issues/44640). [#44953](https://github.com/ClickHouse/ClickHouse/pull/44953) ([Kruglov Pavel](https://github.com/Avogar)). + Add settings input_format_tsv/csv/custom_detect_header that enable this behaviour (enabled by default). Closes [#44640](https://github.com/ClickHouse/ClickHouse/issues/44640). [#44953](https://github.com/ClickHouse/ClickHouse/pull/44953) ([Kruglov Pavel](https://github.com/Avogar)). #### Experimental Feature {#experimental-feature-7} * Add an experimental inverted index as a new secondary index type for efficient text search. [#38667](https://github.com/ClickHouse/ClickHouse/pull/38667) ([larryluogit](https://github.com/larryluogit)). diff --git a/docs/whats-new/changelog/2024.md b/docs/whats-new/changelog/2024.md index 1f32611026e..e3b57784dc1 100644 --- a/docs/whats-new/changelog/2024.md +++ b/docs/whats-new/changelog/2024.md @@ -137,7 +137,6 @@ description: 'Changelog for 2024' * Split large translation units to avoid compilation failures due to memory/cpu limitations. [#72352](https://github.com/ClickHouse/ClickHouse/pull/72352) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). * OSX: Build with ICU support, which enables collations, charset conversions and other localization features. [#73083](https://github.com/ClickHouse/ClickHouse/pull/73083) ([Raúl Marín](https://github.com/Algunenano)). - ### ClickHouse release 24.11, 2024-11-26 {#a-id2411a-clickhouse-release-2411-2024-11-26} #### Backward Incompatible Change {#backward-incompatible-change-1} @@ -451,7 +450,6 @@ description: 'Changelog for 2024' * Fix a logical error due to negative zeros in the two-level hash table. This closes [#70973](https://github.com/ClickHouse/ClickHouse/issues/70973). [#70979](https://github.com/ClickHouse/ClickHouse/pull/70979) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Fix `limit by`, `limit with ties` for distributed and parallel replicas. [#70880](https://github.com/ClickHouse/ClickHouse/pull/70880) ([Nikita Taranov](https://github.com/nickitat)). - ### ClickHouse release 24.9, 2024-09-26 {#a-id249a-clickhouse-release-249-2024-09-26} #### Backward Incompatible Change {#backward-incompatible-change-3} @@ -613,7 +611,6 @@ description: 'Changelog for 2024' * Use tryconvertfieldtotype in gethyperrectangleforrowgroup. [#69745](https://github.com/ClickHouse/ClickHouse/pull/69745) ([Miсhael Stetsyuk](https://github.com/mstetsyuk)). * Revert "Fix prewhere without columns and without adaptive index granularity (almost w/o anything)"'. 
Due to the reverted changes some errors might happen when reading data parts produced by old CH releases (presumably 2021 or older). [#68897](https://github.com/ClickHouse/ClickHouse/pull/68897) ([Alexander Gololobov](https://github.com/davenger)).
-
 ### ClickHouse release 24.8 LTS, 2024-08-20 {#a-id248a-clickhouse-release-248-lts-2024-08-20}
 #### Backward Incompatible Change {#backward-incompatible-change-4}
@@ -758,7 +755,6 @@ description: 'Changelog for 2024'
 * Try fix postgres crash when query is cancelled. [#68288](https://github.com/ClickHouse/ClickHouse/pull/68288) ([Kseniia Sumarokova](https://github.com/kssenii)).
 * Fix missing sync replica mode in query `SYSTEM SYNC REPLICA`. [#68326](https://github.com/ClickHouse/ClickHouse/pull/68326) ([Duc Canh Le](https://github.com/canhld94)).
-
 ### ClickHouse release 24.7, 2024-07-30 {#a-id247a-clickhouse-release-247-2024-07-30}
 #### Backward Incompatible Change {#backward-incompatible-change-5}
@@ -1244,7 +1240,6 @@ description: 'Changelog for 2024'
 * Ignore `text_log` for Keeper [#64218](https://github.com/ClickHouse/ClickHouse/pull/64218) ([Antonio Andelic](https://github.com/antonio2368)).
 * Fix Logical error: Bad cast for Buffer table with prewhere. [#64388](https://github.com/ClickHouse/ClickHouse/pull/64388) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
-
 ### ClickHouse release 24.4, 2024-04-30 {#a-id244a-clickhouse-release-244-2024-04-30}
 #### Upgrade Notes {#upgrade-notes}
@@ -1405,7 +1400,6 @@ description: 'Changelog for 2024'
 * Set server name for SSL handshake in MongoDB engine [#63122](https://github.com/ClickHouse/ClickHouse/pull/63122) ([Alexander Gololobov](https://github.com/davenger)).
 * Use user specified db instead of "config" for MongoDB wire protocol version check [#63126](https://github.com/ClickHouse/ClickHouse/pull/63126) ([Alexander Gololobov](https://github.com/davenger)).
-
 ### ClickHouse release 24.3 LTS, 2024-03-27 {#a-id243a-clickhouse-release-243-lts-2024-03-27}
 #### Upgrade Notes {#upgrade-notes-1}
@@ -1733,7 +1727,6 @@ description: 'Changelog for 2024'
 * Fix OptimizeDateOrDateTimeConverterWithPreimageVisitor with null arguments [#60453](https://github.com/ClickHouse/ClickHouse/pull/60453) ([Raúl Marín](https://github.com/Algunenano)).
 * Fixed a minor bug that prevented distributed table queries sent from either KQL or PRQL dialect clients to be executed on replicas. [#59674](https://github.com/ClickHouse/ClickHouse/issues/59674). [#60470](https://github.com/ClickHouse/ClickHouse/pull/60470) ([Alexey Milovidov](https://github.com/alexey-milovidov)) [#59674](https://github.com/ClickHouse/ClickHouse/pull/59674) ([Austin Kothig](https://github.com/kothiga)).
-
 ### ClickHouse release 24.1, 2024-01-30 {#a-id241a-clickhouse-release-241-2024-01-30}
 #### Backward Incompatible Change {#backward-incompatible-change-9}
diff --git a/docs/whats-new/security-changelog.md b/docs/whats-new/security-changelog.md
index 2f02469a7e5..0d1cc5a138f 100644
--- a/docs/whats-new/security-changelog.md
+++ b/docs/whats-new/security-changelog.md
@@ -215,4 +215,3 @@ Credits: Andrey Krasichkov and Evgeny Sidorov of Yandex Information Security Tea
 Incorrect configuration in deb package could lead to the unauthorized use of the database.
 
 Credits: the UK's National Cyber Security Centre (NCSC)
-
diff --git a/i18n/jp/docusaurus-plugin-content-docs/current/cloud/manage/troubleshooting-billing-issues.md b/i18n/jp/docusaurus-plugin-content-docs/current/cloud/manage/troubleshooting-billing-issues.md
index 34d8ffad594..9bc535b4da5 100644
--- a/i18n/jp/docusaurus-plugin-content-docs/current/cloud/manage/troubleshooting-billing-issues.md
+++ b/i18n/jp/docusaurus-plugin-content-docs/current/cloud/manage/troubleshooting-billing-issues.md
@@ -5,7 +5,6 @@
 'description': '一般的な請求の問題のトラブルシューティング記事'
 ---
-import trial_expired from '@site/static/images/cloud/manage/trial-expired.png';
 import Image from '@theme/IdealImage';
diff --git a/i18n/ru/docusaurus-plugin-content-docs/current/cloud/manage/troubleshooting-billing-issues.md b/i18n/ru/docusaurus-plugin-content-docs/current/cloud/manage/troubleshooting-billing-issues.md
index 37c0c8a6b2a..f78be2c9bc9 100644
--- a/i18n/ru/docusaurus-plugin-content-docs/current/cloud/manage/troubleshooting-billing-issues.md
+++ b/i18n/ru/docusaurus-plugin-content-docs/current/cloud/manage/troubleshooting-billing-issues.md
@@ -5,7 +5,6 @@ title: 'Устранение проблем с выставлением счет
 description: 'Статья по устранению распространённых проблем с выставлением счетов'
 ---
-import trial_expired from '@site/static/images/cloud/manage/trial-expired.png';
 import Image from '@theme/IdealImage';
diff --git a/i18n/zh/docusaurus-plugin-content-docs/current/cloud/manage/troubleshooting-billing-issues.md b/i18n/zh/docusaurus-plugin-content-docs/current/cloud/manage/troubleshooting-billing-issues.md
index d6998cebe71..61cecdb68af 100644
--- a/i18n/zh/docusaurus-plugin-content-docs/current/cloud/manage/troubleshooting-billing-issues.md
+++ b/i18n/zh/docusaurus-plugin-content-docs/current/cloud/manage/troubleshooting-billing-issues.md
@@ -5,10 +5,8 @@
 'description': '用于常见账单问题的故障排除文章'
 ---
-import trial_expired from '@site/static/images/cloud/manage/trial-expired.png';
 import Image from '@theme/IdealImage';
-
 # 故障排除账单问题
 ## 修复无效的支付信息 {#fixing-non-working-payment-details}
@@ -21,4 +19,3 @@ import Image from '@theme/IdealImage';
-试用已过期
diff --git a/i18n/zh/docusaurus-plugin-content-docs/current/use-cases/observability/clickstack/deployment/helm.md b/i18n/zh/docusaurus-plugin-content-docs/current/use-cases/observability/clickstack/deployment/helm.md
index 900012b94fa..f153ce8ea45 100644
--- a/i18n/zh/docusaurus-plugin-content-docs/current/use-cases/observability/clickstack/deployment/helm.md
+++ b/i18n/zh/docusaurus-plugin-content-docs/current/use-cases/observability/clickstack/deployment/helm.md
@@ -15,10 +15,10 @@ The helm chart for HyperDX can be found [here](https://github.com/hyperdxio/helm
 By default, the Helm chart provisions all core components, including:
-* **ClickHouse**
-* **HyperDX**
-* **OpenTelemetry (OTel) collector**
-* **MongoDB** (for persistent application state)
+***ClickHouse**
+***HyperDX**
+***OpenTelemetry (OTel) collector**
+***MongoDB** (for persistent application state)
 However, it can be easily customized to integrate with an existing ClickHouse deployment - for example, one hosted in **ClickHouse Cloud**.
diff --git a/scripts/.markdownlint-cli2.yaml b/scripts/.markdownlint-cli2.yaml
index 5a0ea0ce416..45e088ba72f 100644
--- a/scripts/.markdownlint-cli2.yaml
+++ b/scripts/.markdownlint-cli2.yaml
@@ -5,7 +5,10 @@
 config:
   # turn settings on or off here
   default: false
+  MD007:
+    indent: 4
   MD010: true # Disallow hard tabs
+  MD012: true # Multiple consecutive blank lines
   MD040: true # Fenced code blocks should have a language specified
   #MD047: true # New line on end of file
   links-url-type: false # Disallow relative links to a .md or .mdx file
@@ -25,7 +28,15 @@
 ignores:
   - "docs/_placeholders"
   - "docs/operations/settings/settings.md" # autogenerated
  - "docs/operations/settings/settings-formats.md" # autogenerated
+  - "docs/use-cases/AI_ML/MCP/index.md" # autogenerated
+  - "docs/chdb/guides/index.md" # autogenerated
+  - "docs/about-us/beta-and-experimental-features.md" # autogenerated
+  - "docs/getting-started/index.md" # autogenerated
+  - "docs/cloud/manage/jan2025_faq/index.md" # autogenerated
+  - "docs/cloud/reference/release-notes-index.md" # autogenerated
+  - "docs/_placeholders"
   - "docs/cloud/manage/api"
+  - "docs/use-cases/AI_ML/MCP/ai_agent_libraries/index.md" # autogenerated
 customRules: # add custom rules here
   - "./markdownlint/custom_rules/links_url_type.js"
diff --git a/scripts/search/README.md b/scripts/search/README.md
index d3e06188b6e..aeb43601a6f 100644
--- a/scripts/search/README.md
+++ b/scripts/search/README.md
@@ -2,7 +2,7 @@
 
 ### Install
 
- - Requires python 3.11
+- Requires python 3.11
 
 ```bash
 pip install -r requirements.txt
@@ -39,7 +39,7 @@ We use this to compute an average nDCG.
 
 ### Install
 
- - Requires python 3.11
+- Requires python 3.11
 
 ```bash
 pip install -r requirements.txt
@@ -51,7 +51,6 @@ pip install -r requirements.txt
 python compute_ndcg.py -d
 ```
 
-
 ```bash
 usage: compute_ndcg.py [-h] [-d] [-v] [input_csv]
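If these options ever need to be generated programmatically, markdownlint-cli2 can also read them from a `.markdownlint-cli2.mjs` module. The sketch below simply mirrors the YAML hunk above in that form; it is illustrative only and not part of this change.

```typescript
// Illustrative only: the same lint options as the YAML above, in module form.
// No markdownlint API is called here; this is just the documented options shape
// (config / ignores / customRules) with a few representative entries.
const options = {
  config: {
    default: false,
    MD007: { indent: 4 }, // unordered-list indentation
    MD010: true,          // disallow hard tabs
    MD012: true,          // no multiple consecutive blank lines
    MD040: true,          // fenced code blocks must declare a language
  },
  ignores: [
    "docs/_placeholders",
    "docs/cloud/manage/api",
    // ...plus the autogenerated pages listed in the hunk above
  ],
  customRules: ["./markdownlint/custom_rules/links_url_type.js"],
};

export default options;
```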
diff --git a/src/components/CodeViewer/index.tsx b/src/components/CodeViewer/index.tsx
index fd1d25c7ed8..6f7292bace9 100644
--- a/src/components/CodeViewer/index.tsx
+++ b/src/components/CodeViewer/index.tsx
@@ -113,7 +113,7 @@ function CodeViewer({
   ): null
   return (
-
+
       { header }
       { code_block }
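The changed wrapper line in the `CodeViewer` hunk above did not survive in this rendering of the diff. As a hedged sketch only: given the `.code-viewer` rule added to `custom.scss` later in this patch, the change presumably tags the component's outer element with that class. The element structure, prop names, and everything else below are assumptions, not the repository's actual code.

```tsx
import React from 'react';

// Hypothetical sketch only: give CodeViewer's outer element the global
// `code-viewer` class so the margin rule added in custom.scss applies to it.
// The element structure and prop names are assumptions, not the actual component.
function CodeViewerFrame({ header, code_block }: { header: React.ReactNode; code_block: React.ReactNode }) {
  return (
    <div className="code-viewer">
      {header}
      {code_block}
    </div>
  );
}

export default CodeViewerFrame;
```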
diff --git a/src/components/Stepper/Stepper.tsx b/src/components/Stepper/Stepper.tsx
index 715e3a31550..bb69b12078a 100644
--- a/src/components/Stepper/Stepper.tsx
+++ b/src/components/Stepper/Stepper.tsx
@@ -124,7 +124,7 @@ const VStepper = ({
   return (
       {enhancedChildren}
diff --git a/src/css/custom.scss b/src/css/custom.scss
index 21ef6708b10..afc7478d8ca 100644
--- a/src/css/custom.scss
+++ b/src/css/custom.scss
@@ -1423,3 +1423,26 @@ input::-ms-input-placeholder { /* Microsoft Edge */
 .DocSearch-Cancel {
   color: var(--docsearch-text-color) !important;
 }
+
+.custom-ul,
+.custom-ol {
+  //padding-inline-start: 2rem; // if we want to indent the lists
+  margin-bottom: var(--ifm-paragraph-margin-bottom);
+}
+
+.custom-li {
+  padding-inline-start: 0.5rem;
+}
+
+.vertical-stepper .custom-ul
+.vertical-stepper .custom-ol {
+  padding-block-end: 1.5rem;
+}
+
+.vertical-stepper > *:last-child {
+  margin-bottom: 0;
+}
+
+.code-viewer {
+  margin-bottom: var(--ifm-paragraph-margin-bottom);
+}
\ No newline at end of file
diff --git a/src/theme/IdealImage/index.tsx b/src/theme/IdealImage/index.tsx
index 5c3a0331eb2..e8c94769570 100644
--- a/src/theme/IdealImage/index.tsx
+++ b/src/theme/IdealImage/index.tsx
@@ -150,8 +150,6 @@ export default function IdealImage(
         ...(background ? { backgroundColor: background == "white" ? "white" : "rgb(31 31 28)" } : {}),
-        marginBottom: "16px",
-        marginTop: "16px",
       }}>
       {/* eslint-disable-next-line jsx-a11y/alt-text */}
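Most of the next hunk, including its `diff --git` header, was lost when this diff was captured; only fragments of `ol:`/`li:` mappings and the `Stepper: VStepper` context survive. Judging from those fragments and the `custom-ul`/`custom-ol`/`custom-li` rules added to `custom.scss` above, it appears to map Markdown list elements onto classed equivalents in a Docusaurus `MDXComponents`-style override. The sketch below is a guess at that pattern; the file path, typing, and exact JSX are assumptions, not the repository's code.

```tsx
// Hypothetical MDXComponents-style override (for example src/theme/MDXComponents.tsx).
// Only the class names come from this diff's custom.scss hunk; everything else is assumed.
import React, { type ComponentProps } from 'react';
import MDXComponents from '@theme-original/MDXComponents';

export default {
  ...MDXComponents,
  // Give Markdown lists the classes styled in custom.scss so their spacing is consistent.
  ul: (props: ComponentProps<'ul'>) => <ul className="custom-ul" {...props} />,
  ol: (props: ComponentProps<'ol'>) => <ol className="custom-ol" {...props} />,
  li: (props: ComponentProps<'li'>) => <li className="custom-li" {...props} />,
};
```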
+  ol: (props) =>
+  li: (props) =>
   // Map to the components expected from the remark plugin
   Stepper: VStepper,
diff --git a/src/theme/Tabs/styles.module.css b/src/theme/Tabs/styles.module.css
index 979cb234231..a8ece9858fb 100644
--- a/src/theme/Tabs/styles.module.css
+++ b/src/theme/Tabs/styles.module.css
@@ -1,5 +1,4 @@
 .tabList {
-  margin-bottom: var(--ifm-leading);
   scrollbar-width: none;
   border-bottom: 1px solid var(--click-color-stroke);
   overflow: auto;
diff --git a/styles/ClickHouse/British.yml b/styles/ClickHouse/British.yml
index 71143270dfd..b1ecf4ac9e9 100644
--- a/styles/ClickHouse/British.yml
+++ b/styles/ClickHouse/British.yml
@@ -6,7 +6,7 @@ extends: substitution
 message: "Use the US spelling '%s' instead of the British '%s'."
 link: https://docs.gitlab.com/development/documentation/styleguide/#language
 vocab: false
-level: error
+level: warning
 action:
   name: replace
   ignorecase: true
diff --git a/styles/ClickHouse/CodeblockFences.yml b/styles/ClickHouse/CodeblockFences.yml
index 363e26fc1c7..9c21a792b01 100644
--- a/styles/ClickHouse/CodeblockFences.yml
+++ b/styles/ClickHouse/CodeblockFences.yml
@@ -6,7 +6,7 @@ extends: existence
 message: "Instead of '%s' for the code block, use yaml, ruby, plaintext, markdown, javascript, shell, go, python, dockerfile, or typescript."
 link: https://docs.gitlab.com/development/documentation/styleguide/#code-blocks
 vocab: false
-level: error
+level: warning
 scope: raw
 raw:
   - '\`\`\`(yml|rb|text|md|bash|sh\n|js\n|golang\n|py\n|docker\n|ts|irb)'
diff --git a/styles/ClickHouse/Exclamation.yml b/styles/ClickHouse/Exclamation.yml
index b4e4a1f7e9a..4d9bb874e43 100644
--- a/styles/ClickHouse/Exclamation.yml
+++ b/styles/ClickHouse/Exclamation.yml
@@ -2,7 +2,7 @@ extends: existence
 message: "Don't use exclamation points in text."
 link: "https://developers.google.com/style/exclamation-points"
 nonword: true
-level: error
+level: warning
 action:
   name: edit
   params:
diff --git a/styles/ClickHouse/HeadingPunctuation.yml b/styles/ClickHouse/HeadingPunctuation.yml
index 30dad70f665..c1729868b64 100644
--- a/styles/ClickHouse/HeadingPunctuation.yml
+++ b/styles/ClickHouse/HeadingPunctuation.yml
@@ -2,7 +2,7 @@ extends: existence
 message: "Don't put a period at the end of a heading."
 link: "https://developers.google.com/style/capitalization#capitalization-in-titles-and-headings"
 nonword: true
-level: error
+level: warning
 scope: heading
 action:
   name: edit
diff --git a/styles/ClickHouse/Headings.yml b/styles/ClickHouse/Headings.yml
index df83f9a0cef..c5322729bf3 100644
--- a/styles/ClickHouse/Headings.yml
+++ b/styles/ClickHouse/Headings.yml
@@ -1,7 +1,7 @@
 extends: capitalization
 message: "'%s' should use sentence-style capitalization."
 link: "https://developers.google.com/style/capitalization#capitalization-in-titles-and-headings"
-level: error
+level: warning
 scope: heading
 match: $sentence
 indicators:
diff --git a/styles/ClickHouse/Ordinal.yml b/styles/ClickHouse/Ordinal.yml
index 20df000f224..98d656ca1c5 100644
--- a/styles/ClickHouse/Ordinal.yml
+++ b/styles/ClickHouse/Ordinal.yml
@@ -1,7 +1,7 @@
 extends: existence
 message: "Spell out all ordinal numbers ('%s') in text."
 link: 'https://developers.google.com/style/numbers'
-level: error
+level: warning
 nonword: true
 tokens:
   - (?
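Taken together, the `custom.scss`, `IdealImage`, and Tabs changes above move vertical spacing out of per-component inline styles and CSS modules into shared classes driven by `--ifm-paragraph-margin-bottom`. A minimal before/after sketch of that pattern, with illustrative component and class names rather than the repository's actual code:

```tsx
import React from 'react';

// Hypothetical before/after sketch; component and class names are illustrative,
// not the repository's actual code.

// Before: each block component carried its own inline margins.
export const FigureBefore = () => (
  <img src="/img/example.png" alt="" style={{ marginTop: '16px', marginBottom: '16px' }} />
);

// After: the component renders margin-free; a shared class, styled once in
// custom.scss with margin-bottom: var(--ifm-paragraph-margin-bottom), owns spacing.
export const FigureAfter = () => (
  <img src="/img/example.png" alt="" className="doc-block" />
);
```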