Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 214 additions & 0 deletions datafusion/functions-aggregate/src/regr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,30 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
Given input column Y and X: regr_slope(Y, X) returns the slope (k in Y = k*X + b) using minimal RSS fitting.",

"regr_slope(expression_y, expression_x)")
.with_sql_example(
r#"```sql
create table weekly_performance(day int,user_signups int) as values (1,60), (2,65), (3, 70), (4,75), (5,80);

select * from weekly_performance;
+------+---------------------+
| day | user_signups |
| ---- | ------------------- |
| 1 | 60 |
| 2 | 65 |
| 3 | 70 |
| 4 | 75 |
| 5 | 80 |
+------+---------------------+

SELECT regr_slope(user_signups, day) AS slope FROM weekly_performance;
+--------+
| slope |
+--------+
| 5.0 |
+--------+
```
"#
)
.with_standard_argument("expression_y", Some("Dependent variable"))
.with_standard_argument("expression_x", Some("Independent variable"))
.build()
Expand All @@ -157,6 +181,31 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
this function returns b.",

"regr_intercept(expression_y, expression_x)")
.with_sql_example(
r#"```sql
create table weekly_performances(week int, productivity_score int) as values (1,60), (2,65), (3, 70), (4,75), (5,80);
select * from weekly_performances;
+------+---------------------+
| week | productivity_score |
| ---- | ------------------- |
| 1 | 60 |
| 2 | 65 |
| 3 | 70 |
| 4 | 75 |
| 5 | 80 |
+------+---------------------+

SELECT regr_intercept(productivity_score, week) AS intercept
FROM weekly_performances;
+----------+
|intercept |
+----------+
| 55 |
+----------+
```
"#
)
.with_standard_argument("expression_y", Some("Dependent variable"))
.with_standard_argument("expression_x", Some("Independent variable"))
.build()
Expand All @@ -169,6 +218,30 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
"Counts the number of non-null paired data points.",
"regr_count(expression_y, expression_x)",
)
.with_sql_example(
r#"```sql
create table daily_metrics(day int, user_signups int) as values (1,100), (2,120), (3, NULL), (4,110), (5,NULL);
select * from daily_metrics;
+-----+---------------+
| day | user_signups |
| --- | ------------- |
| 1 | 100 |
| 2 | 120 |
| 3 | NULL |
| 4 | 110 |
| 5 | NULL |
+-----+---------------+

SELECT regr_count(user_signups, day) AS valid_pairs
FROM daily_metrics;
+-------------+
| valid_pairs |
+-------------+
| 3 |
+-------------+
```
"#
)
.with_standard_argument("expression_y", Some("Dependent variable"))
.with_standard_argument("expression_x", Some("Independent variable"))
.build(),
Expand All @@ -181,6 +254,30 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
"Computes the square of the correlation coefficient between the independent and dependent variables.",

"regr_r2(expression_y, expression_x)")
.with_sql_example(
r#"```sql
create table weekly_performances(week int, productivity_score int) as values (1,60), (2,65), (3, 70), (4,75), (5,80);
select * from weekly_performances;
+------+---------------------+
| week | productivity_score |
| ---- | ------------------- |
| 1 | 60 |
| 2 | 65 |
| 3 | 70 |
| 4 | 75 |
| 5 | 80 |
+------+---------------------+

SELECT regr_r2(productivity_score, week) AS r_squared
FROM weekly_performances;
+---------+
|r_squared|
+---------+
| 1.0 |
+---------+
```
"#
)
.with_standard_argument("expression_y", Some("Dependent variable"))
.with_standard_argument("expression_x", Some("Independent variable"))
.build()
Expand All @@ -193,6 +290,31 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
"Computes the average of the independent variable (input) expression_x for the non-null paired data points.",

"regr_avgx(expression_y, expression_x)")
.with_sql_example(
r#"```sql
> create table daily_sales(day int, total_sales int) as values (1,100), (2,150), (3,200), (4,NULL), (5,250);
> select * from daily_sales;
+-----+-------------+
| day | total_sales |
| --- | ----------- |
| 1 | 100 |
| 2 | 150 |
| 3 | 200 |
| 4 | NULL |
| 5 | 250 |
+-----+-------------+

SELECT regr_avgx(total_sales, day) AS avg_day
FROM daily_sales; --output = (1+2+3+5)/4 = 2.75
+----------+
| avg_day |
+----------+
| 2.75 |
+----------+

```
"#
)
.with_standard_argument("expression_y", Some("Dependent variable"))
.with_standard_argument("expression_x", Some("Independent variable"))
.build()
Expand All @@ -205,6 +327,30 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
"Computes the average of the dependent variable (output) expression_y for the non-null paired data points.",

"regr_avgy(expression_y, expression_x)")
.with_sql_example(
r#"```sql
create table daily_temperature(day int, temperature int) as values (1,30), (2,32), (3, NULL), (4,35), (5,36);
select * from daily_temperature;
+-----+-------------+
| day | temperature |
| --- | ----------- |
| 1 | 30 |
| 2 | 32 |
| 3 | NULL |
| 4 | 35 |
| 5 | 36 |
+-----+-------------+

SELECT regr_avgy(temperature, day) AS avg_temperature --temperature as Dependent Variable(Y)
FROM daily_temperature;
+-----------------+
| avg_temperature |
+-----------------+
| 33.25 |
+-----------------+
```
"#
)
.with_standard_argument("expression_y", Some("Dependent variable"))
.with_standard_argument("expression_x", Some("Independent variable"))
.build()
Expand All @@ -217,6 +363,30 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
"Computes the sum of squares of the independent variable.",
"regr_sxx(expression_y, expression_x)",
)
.with_sql_example(
r#"```sql
create table study_hours(student_id int, hours int, test_score int) as values (1,2,55), (2,4,65), (3,6,75), (4,8,85), (5,10,95);
select * from study_hours;
+-------------+-----------+-----------------+
| student_id | hours (x) | test_score (y) |
| ----------- | --------- | --------------- |
| 1 | 2 | 55 |
| 2 | 4 | 65 |
| 3 | 6 | 75 |
| 4 | 8 | 85 |
| 5 | 10 | 95 |
+-------------+-----------+-----------------+

SELECT regr_sxx(test_score, hours) AS sxx
FROM study_hours; --Output - 40
+-------+
| sxx |
+-------+
| 40 |
+-------+
```
"#
)
.with_standard_argument("expression_y", Some("Dependent variable"))
.with_standard_argument("expression_x", Some("Independent variable"))
.build(),
Expand All @@ -229,6 +399,28 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
"Computes the sum of squares of the dependent variable.",
"regr_syy(expression_y, expression_x)",
)
.with_sql_example(
r#"```sql
create table employee_productivity(week int, productivity_score int) as values (1,60), (2,65), (3,70);
select * from employee_productivity;
+-------+---------------------+
| week | productivity_score |
+-------+---------------------+
| 1 | 60 |
| 2 | 65 |
| 3 | 70 |
+-------+---------------------+

SELECT regr_syy(productivity_score, week) AS sum_squares_y
FROM employee_productivity;
+---------------+
| sum_squares_y |
+---------------+
| 50.0 |
+---------------+
```
"#
)
.with_standard_argument("expression_y", Some("Dependent variable"))
.with_standard_argument("expression_x", Some("Independent variable"))
.build(),
Expand All @@ -241,6 +433,28 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
"Computes the sum of products of paired data points.",
"regr_sxy(expression_y, expression_x)",
)
.with_sql_example(
r#"```sql
create table employee_productivity(week int, productivity_score int) as values (1,60), (2,65), (3,70);
select * from employee_productivity;
+-------+---------------------+
| week | productivity_score |
+-------+---------------------+
| 1 | 60 |
| 2 | 65 |
| 3 | 70 |
+-------+---------------------+

SELECT regr_sxy(productivity_score, week) AS sum_product_deviations
FROM employee_productivity;
+------------------------+
| sum_product_deviations |
+------------------------+
| 10.0 |
+------------------------+
```
"#
)
.with_standard_argument("expression_y", Some("Dependent variable"))
.with_standard_argument("expression_x", Some("Independent variable"))
.build(),
Expand Down
Loading
Loading