Skip to content

polars_vs_pandas #682

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
Binary file not shown.
207 changes: 207 additions & 0 deletions polars-vs-pandas/DataFrame_Plots.ipynb

Large diffs are not rendered by default.

Binary file added polars-vs-pandas/Online_Retail.parquet
Binary file not shown.
23 changes: 23 additions & 0 deletions polars-vs-pandas/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
The materials contained in this download are designed to complement the Real Python tutorial [Polars vs pandas - What's the Difference](https://realpython.com/polars-vs-pandas-difference/).

You should create a new folder on your computer, for example one named polars-vs-pandas, and place each of these files inside it. You may also consider creating a [Python virtual environment](https://realpython.com/python-virtual-environments-a-primer/) within this folder.

Your download bundle contains the following files:

Online_Retail.parquet - This Parquet file contains retail data used in some of the queries.

data_generation.py - This script contains the data_generation() function used to generate different quantities of data.

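code_speed_test.py - This script performs time tests for pandas and Polars DataFrames. Pass the number of rows to generate as a command-line argument, for example `python code_speed_test.py 1000000`.

dataframe_and_lazyframe_time_tests.py - This script performs time tests for DataFrames and a LazyFrame. It also takes the number of rows to generate as a command-line argument.

dataframe_and_lazyframe_time_tests_v2.py - This script performs time tests for a LazyFrame with streaming enabled. It also takes the number of rows to generate as a command-line argument.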


dataframe_conversions.py - This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example.

sample_pandas_and_polars_code.py - This file contains the code used to illustrate the differences between pandas and Polars syntax.

DataFrame_Plots.ipynb - This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities.

44 changes: 44 additions & 0 deletions polars-vs-pandas/code_speed_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import sys
import time

import pandas as pd
import polars as pl
from data_generation import data_generation

# Create DataFrames

# The number of rows to generate is read from the first command-line
# argument, e.g. `python code_speed_test.py 1000000`.
data_source = data_generation(int(sys.argv[1]))

# Both DataFrames are built from the same generated data before any timer
# starts, so only the aggregation queries below are measured.
orders_pandas = pd.DataFrame(data_source).convert_dtypes(
    dtype_backend="pyarrow"
)

orders_polars = pl.DataFrame(data_source)

# pandas DataFrame Test

# time.perf_counter() is used rather than time.time(): it is the clock
# intended for measuring short durations and is not affected by
# adjustments to the system clock.
start_time = time.perf_counter()

# The result is discarded on purpose; only the elapsed time matters here.
orders_pandas.groupby("region")["sales_income"].sum()

end_time = time.perf_counter()

print(f"pandas Time Taken: {end_time-start_time}.")

# Polars DataFrame Test

start_time = time.perf_counter()

# Same aggregation as the pandas test: total sales_income per region.
(
    orders_polars.group_by("region").agg(
        total=pl.col("sales_income").sum(),
    )
)

end_time = time.perf_counter()

# Results

print(f"polars Time Taken: {end_time-start_time}.")

print(f"----- For {sys.argv[1]} rows")
17 changes: 17 additions & 0 deletions polars-vs-pandas/data_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import numpy as np


def data_generation(number_of_rows):
    """Build a dictionary of randomly generated order data.

    The returned dictionary has five entries, each holding
    ``number_of_rows`` values:

    - "order_id": ``range(1, number_of_rows + 1)``
    - "region", "sales_person", "product": NumPy arrays of labels drawn
      with ``np.random.choice()`` from fixed lists
    - "sales_income": NumPy array of integers drawn with
      ``np.random.randint(1, 5001)``, i.e. from 1 to 5000 inclusive
    """
    order_ids = range(1, number_of_rows + 1)

    # The draws happen in the same order as the keys of the result so
    # that a seeded global NumPy generator yields the same data.
    regions = np.random.choice(
        ["North", "South", "East", "West"], size=number_of_rows
    )
    sales_people = np.random.choice(
        ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
    )
    products = np.random.choice(
        ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
    )
    incomes = np.random.randint(1, 5001, size=number_of_rows)

    return {
        "order_id": order_ids,
        "region": regions,
        "sales_person": sales_people,
        "product": products,
        "sales_income": incomes,
    }
82 changes: 82 additions & 0 deletions polars-vs-pandas/dataframe_and_lazyframe_time_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import sys
import time

import pandas as pd
import polars as pl
from data_generation import data_generation

# Data Generation

# The number of rows to generate is read from the first command-line
# argument. The same generated data feeds all three tests below.
test_data = data_generation(int(sys.argv[1]))

# Each test takes three timestamps: before the frame is created, after it
# is created (just before the query), and after the query finishes.
# time.perf_counter() is used rather than time.time() because it is the
# clock intended for measuring short durations and is not affected by
# adjustments to the system clock.

# Polars DataFrame Test

overall_time_start = time.perf_counter()

polars_dataframe = pl.DataFrame(test_data)

processing_time_start = time.perf_counter()

# The result is discarded; only the elapsed time is of interest.
(
    polars_dataframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
)

end_time = time.perf_counter()

# Drop the reference to the frame before the next test starts.
del polars_dataframe

print(
    f"Polars DataFrame creation: {processing_time_start - overall_time_start}"
)
print(f"Polars DataFrame query runtime: {end_time - processing_time_start}")
print(f"Polars DataFrame overall time: {end_time - overall_time_start}")
print()

# Polars LazyFrame Test

overall_time_start = time.perf_counter()

polars_lazyframe = pl.LazyFrame(test_data)

processing_time_start = time.perf_counter()

# .collect() is what executes the lazy query, so it is inside the timed
# region.
(
    polars_lazyframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
).collect()

end_time = time.perf_counter()

del polars_lazyframe

print(
    f"Polars LazyFrame creation: {processing_time_start - overall_time_start}"
)
print(f"Polars LazyFrame query runtime: {end_time - processing_time_start}")
print(f"Polars LazyFrame overall time: {end_time - overall_time_start}")
print()

# Pandas DataFrame Test

overall_time_start = time.perf_counter()

pandas_dataframe = pd.DataFrame(test_data)

processing_time_start = time.perf_counter()

pandas_dataframe.groupby(["region", "product", "sales_person"])[
    "sales_income"
].sum()

end_time = time.perf_counter()

del pandas_dataframe

print(
    f"Pandas DataFrame creation: {processing_time_start - overall_time_start}"
)
print(f"Pandas DataFrame query runtime: {end_time - processing_time_start}")
print(f"Pandas DataFrame overall time: {end_time - overall_time_start}")
66 changes: 66 additions & 0 deletions polars-vs-pandas/dataframe_and_lazyframe_time_tests_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import sys
import time

import pandas as pd
import polars as pl
from data_generation import data_generation

# Data Generation

# The number of rows to generate is read from the first command-line
# argument. The same generated data feeds every section below.
test_data = data_generation(int(sys.argv[1]))

# The first three sections run the same grouped aggregation as the timed
# test at the end, but are not timed and their results are discarded.
# NOTE(review): presumably they are kept so this script does the same
# preceding work as dataframe_and_lazyframe_time_tests.py - confirm.

# Polars DataFrame Test

polars_dataframe = pl.DataFrame(test_data)

(
    polars_dataframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
)

# Polars LazyFrame Test

polars_lazyframe = pl.LazyFrame(test_data)

(
    polars_lazyframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
).collect()


# Pandas DataFrame Test

pandas_dataframe = pd.DataFrame(test_data)

pandas_dataframe.groupby(["region", "product", "sales_person"])[
    "sales_income"
].sum()

# Polars LazyFrame Streaming Test

# time.perf_counter() is used rather than time.time() because it is the
# clock intended for measuring short durations and is not affected by
# adjustments to the system clock.
overall_time_start = time.perf_counter()

polars_lazyframe = pl.LazyFrame(test_data)

processing_time_start = time.perf_counter()

# The query is collected with the streaming engine selected.
# NOTE(review): the `engine` argument needs a Polars release that
# supports it - confirm the minimum version used by the tutorial.
(
    polars_lazyframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
).collect(engine="streaming")

end_time = time.perf_counter()

print(
    f"Polars Streaming LazyFrame creation: {processing_time_start - overall_time_start}"
)
print(
    f"Polars Streaming LazyFrame query runtime: {end_time - processing_time_start}"
)
print(
    f"Polars Streaming LazyFrame overall time: {end_time - overall_time_start}"
)
print()
31 changes: 31 additions & 0 deletions polars-vs-pandas/dataframe_conversions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import narwhals as nw
import polars as pl

# Start from a small Polars DataFrame with one string column and two
# integer columns.
polars_df = pl.DataFrame(
    dict(
        a=["a", "b", "a", "b", "c"],
        b=[1, 2, 1, 3, 3],
        c=[5, 4, 3, 2, 1],
    )
)

# Polars -> pandas. The bare type() expression shows its value only in an
# interactive session; when run as a script the result is discarded.
pandas_df = polars_df.to_pandas()
type(pandas_df)

# pandas -> Polars, rebinding polars_df to the converted frame.
polars_df = pl.from_pandas(pandas_df)
type(polars_df)


def agnostic_groupby(df):
    """Sum column "b" for each value of column "a" using Narwhals.

    The DataFrame is wrapped with ``nw.from_native()``, grouped by "a",
    aggregated, sorted by "a", and then converted back with
    ``.to_native()``. It is called below with both a pandas and a Polars
    DataFrame.
    """
    return (
        nw.from_native(df)
        .group_by("a")
        .agg(nw.col("b").sum())
        .sort("a")
        .to_native()
    )


# Backward-compatible alias: the function was originally defined under
# this misspelled name, so it is kept for any existing callers.
agnositic_groupby = agnostic_groupby


agnostic_groupby(pandas_df)

agnostic_groupby(polars_df)
Binary file added polars-vs-pandas/git_issue.txt
Binary file not shown.
22 changes: 22 additions & 0 deletions polars-vs-pandas/sample_pandas_and_polars_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import pandas as pd
import polars as pl

# pandas version: add a Total column, then show the first three rows
# whose total exceeds 10.

orders_pandas = pd.read_parquet("Online_Retail.parquet")

orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]

orders_pandas[["InvoiceNo", "Quantity", "UnitPrice", "Total"]][
    orders_pandas["Total"] > 10
].head(3)


# Polars version of the same query.

# pl.read_csv("online_retail.csv") would load a CSV copy of the data
# instead, but no CSV file is listed among the download materials and its
# result was overwritten by the Parquet read below, so it is not executed.

# The file name matches the pandas call above exactly; a differently
# cased name fails on case-sensitive filesystems.
orders_polars = pl.read_parquet("Online_Retail.parquet")

(
    orders_polars.select(
        pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
        total=pl.col("Quantity") * pl.col("UnitPrice"),
    ).filter(pl.col("total") > 10)
).head(3)