My Small Polars Tutorial
Created 2022-09-09 by Stefan Otte.
This is a small polars tutorial. It covers basic polars concepts as well as some random (hopefully) useful things.
It is based on the great polars cheat sheet, but is heavily extended and restructured.
You can find a notebook version here.
Here are some important facts that you should know when you work with polars:
polars DataFrames don't have an index, i.e. no .reset_index() every third line and no multi-index complexity, but some operations are a bit more cumbersome in return.
There are two main concepts in polars: expressions and contexts.
- expression: what to do with the data without actually knowing the data, e.g. pl.col("foo").sort().head(2)
- context: the context in which an expression is evaluated, e.g. in a group_by
Because of the expression/context setup, method chaining makes even more sense than with pandas.
These seven verbs cover most things you want to do with polars:
select # select columns (and add new ones)
with_columns # like select but keep existing columns
sort # sort rows
filter # filter rows
group_by # group dataframe
agg # aggregate groups
join # join/merge another dataframe
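To see how the verbs compose, here is a minimal end-to-end sketch (the toy frames are made up for illustration and are not part of the tutorial's example data):

import polars as pl

toy = pl.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})
other = pl.DataFrame({"g": ["a", "b"], "y": [10, 20]})

(
    toy
    .with_columns(x2=pl.col("x") * 2)   # add a derived column
    .filter(pl.col("x") > 1)            # keep only some rows
    .join(other, on="g")                # merge another frame
    .group_by("g")                      # group ...
    .agg(pl.col("x2").sum())            # ... and aggregate per group
    .sort("g")                          # order the result
    .select("g", "x2")                  # pick the output columns
)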
As always, read the friendly manual to really understand how to use polars.
IMO reading manuals is a superpower and everybody can attain it. :shrug:
Setup
Install
The tutorial uses the following requirements.txt:

polars[plot,pyarrow]==1.2.1
pandera[polars]==0.20.3
jupyter
jupytext
ruff

!pip install -r requirements.txt
Import
import polars as pl
# I personally like to import col as c to shorten some expressions
from polars import col as c
Basics
Creating/Reading/Saving DataFrames
# Create DataFrame
df = pl.DataFrame(
{
"nrs": [1, 2, 3, None, 5],
"names": ["foo", "ham", "spam", "egg", None],
"random": [0.3, 0.7, 0.1, 0.9, 0.6],
"groups": ["A", "A", "B", "C", "B"],
}
)
df
shape: (5, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
1 | "foo" | 0.3 | "A" |
2 | "ham" | 0.7 | "A" |
3 | "spam" | 0.1 | "B" |
null | "egg" | 0.9 | "C" |
5 | null | 0.6 | "B" |
# Save dataframes as csv
# (do yourself a favour and switch to parquet instead of CSV!)
df.write_csv("df.csv")
# Read CSV
(pl.read_csv("df.csv", columns=["nrs", "names", "random", "groups"]).equals(df))
True
# Save dataframe as parquet
df.write_parquet("df.parquet")
# Read parquet
# Note: you can also read multiple frames with wildcards
(pl.read_parquet("df*.parquet").equals(df))
True
Select columns - select()
# Select multiple columns with specific names
df.select("nrs", "names")
# equivalent
df.select(pl.col("nrs"), pl.col("names"))
df.select(pl.col.nrs, pl.col.names)
shape: (5, 2)
nrs | names |
--- | --- |
i64 | str |
1 | "foo" |
2 | "ham" |
3 | "spam" |
null | "egg" |
5 | null |
df.select(pl.all().exclude("random"))
shape: (5, 3)
nrs | names | groups |
--- | --- | --- |
i64 | str | str |
1 | "foo" | "A" |
2 | "ham" | "A" |
3 | "spam" | "B" |
null | "egg" | "C" |
5 | null | "B" |
# Select columns whose name matches a regular expression.
df.select(pl.col("^n.*$"))
shape: (5, 2)
nrs | names |
--- | --- |
i64 | str |
1 | "foo" |
2 | "ham" |
3 | "spam" |
null | "egg" |
5 | null |
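Besides names and regexes, polars also ships column selectors (polars.selectors), which are handy for dtype-based selection; a small sketch:

import polars.selectors as cs

# select all numeric columns
df.select(cs.numeric())
# select everything that is not a string column
df.select(~cs.string())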
Add New Columns - select() and with_columns()
df.select(NAMES=c.names)
df.select(c("names").alias("NAMES"))
shape: (5, 1)
NAMES |
--- |
str |
"foo" |
"ham" |
"spam" |
"egg" |
null |
# Keep existing and add new columns with `with_columns`
df.with_columns((pl.col("random") * pl.col("nrs")).alias("product"))
df.with_columns(product=pl.col("random") * pl.col("nrs"))
shape: (5, 5)
nrs | names | random | groups | product |
--- | --- | --- | --- | --- |
i64 | str | f64 | str | f64 |
1 | "foo" | 0.3 | "A" | 0.3 |
2 | "ham" | 0.7 | "A" | 1.4 |
3 | "spam" | 0.1 | "B" | 0.3 |
null | "egg" | 0.9 | "C" | null |
5 | null | 0.6 | "B" | 3.0 |
# Add several new columns to the DataFrame
df.with_columns(
product=(pl.col("random") * pl.col("nrs")),
names_len_bytes=pl.col("names").str.len_bytes(),
)
shape: (5, 6)
nrs | names | random | groups | product | names_len_bytes |
--- | --- | --- | --- | --- | --- |
i64 | str | f64 | str | f64 | u32 |
1 | "foo" | 0.3 | "A" | 0.3 | 3 |
2 | "ham" | 0.7 | "A" | 1.4 | 3 |
3 | "spam" | 0.1 | "B" | 0.3 | 4 |
null | "egg" | 0.9 | "C" | null | 3 |
5 | null | 0.6 | "B" | 3.0 | null |
# Add a column 'index' that enumerates the rows
df.with_row_index()
shape: (5, 5)
index | nrs | names | random | groups |
--- | --- | --- | --- | --- |
u32 | i64 | str | f64 | str |
0 | 1 | "foo" | 0.3 | "A" |
1 | 2 | "ham" | 0.7 | "A" |
2 | 3 | "spam" | 0.1 | "B" |
3 | null | "egg" | 0.9 | "C" |
4 | 5 | null | 0.6 | "B" |
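with_columns also works with literals and casts; a small sketch (the column names note and nrs_float are made up for illustration):

df.with_columns(
    pl.lit("constant").alias("note"),                    # add a constant column
    pl.col("nrs").cast(pl.Float64).alias("nrs_float"),   # change the dtype
)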
Select rows - filter() and friends
# Filter: Extract rows that meet logical criteria.
df.filter(pl.col("random") > 0.5)
df.filter(c("random") > 0.5)
df.filter(c.random > 0.5)
shape: (3, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
2 | "ham" | 0.7 | "A" |
null | "egg" | 0.9 | "C" |
5 | null | 0.6 | "B" |
df.filter((pl.col("groups") == "B") & (pl.col("random") > 0.5))
shape: (1, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
5 | null | 0.6 | "B" |
# Randomly select fraction of rows.
df.sample(fraction=0.5)
shape: (2, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
2 | "ham" | 0.7 | "A" |
1 | "foo" | 0.3 | "A" |
# Randomly select n rows.
df.sample(n=2)
shape: (2, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
3 | "spam" | 0.1 | "B" |
null | "egg" | 0.9 | "C" |
# Select first n rows
df.head(n=2)
# Select last n rows.
df.tail(n=2)
shape: (2, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
null | "egg" | 0.9 | "C" |
5 | null | 0.6 | "B" |
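A few other filter helpers that come up a lot; a small sketch:

# membership test
df.filter(pl.col("groups").is_in(["A", "C"]))
# null check
df.filter(pl.col("names").is_null())
# range check (inclusive on both ends by default)
df.filter(pl.col("random").is_between(0.2, 0.8))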
Select rows and columns
# Select rows at positions 2 and 3 (the end of the slice is exclusive)
df[2:4, :]
shape: (2, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
3 | "spam" | 0.1 | "B" |
null | "egg" | 0.9 | "C" |
# Select columns in positions 1 and 3 (first column is 0).
df[:, [1, 3]]
shape: (5, 2)
names | groups |
--- | --- |
str | str |
"foo" | "A" |
"ham" | "A" |
"spam" | "B" |
"egg" | "C" |
null | "B" |
# Select rows meeting logical condition, and only the specific columns.
(df.filter(pl.col("random") > 0.5).select("names", "groups"))
shape: (3, 2)
names | groups |
--- | --- |
str | str |
"ham" | "A" |
"egg" | "C" |
null | "B" |
# Select one column as a Series
print(type(df["names"]))
df["names"]
<class 'polars.series.series.Series'>
shape: (5,)
names |
--- |
str |
"foo" |
"ham" |
"spam" |
"egg" |
null |
Sort rows - sort()
# Order rows by values of a column (high to low)
df.sort("random", descending=True)
shape: (5, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
null | "egg" | 0.9 | "C" |
2 | "ham" | 0.7 | "A" |
5 | null | 0.6 | "B" |
1 | "foo" | 0.3 | "A" |
3 | "spam" | 0.1 | "B" |
# Order by multiple columns
df.sort("groups", "random")
shape: (5, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
1 | "foo" | 0.3 | "A" |
2 | "ham" | 0.7 | "A" |
3 | "spam" | 0.1 | "B" |
5 | null | 0.6 | "B" |
null | "egg" | 0.9 | "C" |
Summarize Data
# Tuple of # of rows, # of columns in DataFrame
df.shape
(5, 4)
# number of rows in DataFrame
len(df)
df.height
5
# number of cols in DataFrame
df.width
4
# Count number of rows with each unique value of a variable
df["groups"].value_counts()
shape: (3, 2)
groups | count |
--- | --- |
str | u32 |
"A" | 2 |
"C" | 1 |
"B" | 2 |
# # of distinct values in a column
df["groups"].n_unique()
3
# Basic descriptive statistics for each column
df.describe()
shape: (9, 5)
statistic | nrs | names | random | groups |
--- | --- | --- | --- | --- |
str | f64 | str | f64 | str |
"count" | 4.0 | "4" | 5.0 | "5" |
"null_count" | 1.0 | "1" | 0.0 | "0" |
"mean" | 2.75 | null | 0.52 | null |
"std" | 1.707825 | null | 0.319374 | null |
"min" | 1.0 | "egg" | 0.1 | "A" |
"25%" | 2.0 | null | 0.3 | null |
"50%" | 3.0 | null | 0.6 | null |
"75%" | 3.0 | null | 0.7 | null |
"max" | 5.0 | "spam" | 0.9 | "C" |
# Aggregation functions
df.select(
# Sum values
pl.sum("random").alias("sum"),
# Minimum value
pl.min("random").alias("min"),
# Maximum value
pl.max("random").alias("max"),
# or
pl.col("random").max().alias("other_max"),
# Standard deviation
pl.std("random").alias("std_dev"),
# Variance
pl.var("random").alias("variance"),
# Median
pl.median("random").alias("median"),
# Mean
pl.mean("random").alias("mean"),
# Quantile
pl.quantile("random", 0.75).alias("quantile_0.75"),
# or
pl.col("random").quantile(0.75).alias("other_quantile_0.75"),
# First value
pl.first("random").alias("first"),
)
shape: (1, 11)
sum | min | max | other_max | std_dev | variance | median | mean | quantile_0.75 | other_quantile_0.75 | first |
--- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
2.6 | 0.1 | 0.9 | 0.9 | 0.319374 | 0.102 | 0.6 | 0.52 | 0.7 | 0.7 | 0.3 |
Group And Aggregate Data - group_by() and agg()
# Group by values in the column "groups", returning a GroupBy object
df.group_by("groups")
<polars.dataframe.group_by.GroupBy at 0x7f674d02c990>
# All of the aggregation functions from above can be applied to a group as well
df.group_by("groups").agg(
# Sum values
pl.sum("random").alias("sum"),
# Minimum value
pl.min("random").alias("min"),
# Maximum value
pl.max("random").alias("max"),
# or
pl.col("random").max().alias("other_max"),
# Standard deviation
pl.std("random").alias("std_dev"),
# Variance
pl.var("random").alias("variance"),
# Median
pl.median("random").alias("median"),
# Mean
pl.mean("random").alias("mean"),
# Quantile
pl.quantile("random", 0.75).alias("quantile_0.75"),
# or
pl.col("random").quantile(0.75).alias("other_quantile_0.75"),
# First value
pl.first("random").alias("first"),
)
shape: (3, 12)
groups | sum | min | max | other_max | std_dev | variance | median | mean | quantile_0.75 | other_quantile_0.75 | first |
--- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
str | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
"B" | 0.7 | 0.1 | 0.6 | 0.6 | 0.353553 | 0.125 | 0.35 | 0.35 | 0.6 | 0.6 | 0.1 |
"C" | 0.9 | 0.9 | 0.9 | 0.9 | null | null | 0.9 | 0.9 | 0.9 | 0.9 | 0.9 |
"A" | 1.0 | 0.3 | 0.7 | 0.7 | 0.282843 | 0.08 | 0.5 | 0.5 | 0.7 | 0.7 | 0.3 |
# Additional GroupBy functions
(
df.group_by("groups").agg(
# Count the number of values in each group
pl.count("random").alias("size"),
# Sample one element in each group
# (favour `map_elements` over `apply`)
pl.col("names").map_elements(
lambda group_df: group_df.sample(1).item(0), return_dtype=pl.String
),
)
)
shape: (3, 3)
groups | size | names |
--- | --- | --- |
str | u32 | str |
"A" | 2 | "foo" |
"B" | 2 | null |
"C" | 1 | "egg" |
(df.group_by("groups").agg(pl.col("names").sample(1).alias("foo")))
shape: (3, 2)
groups | foo |
--- | --- |
str | list[str] |
"C" | ["egg"] |
"B" | [null] |
"A" | ["foo"] |
Reshaping Data - Changing Layout and Renaming
# Rename the columns of a DataFrame
df.rename({"nrs": "idx"})
shape: (5, 4)
idx | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
1 | "foo" | 0.3 | "A" |
2 | "ham" | 0.7 | "A" |
3 | "spam" | 0.1 | "B" |
null | "egg" | 0.9 | "C" |
5 | null | 0.6 | "B" |
# Drop columns from DataFrame
df.drop(["names", "random"])
shape: (5, 2)
nrs | groups |
--- | --- |
i64 | str |
1 | "A" |
2 | "A" |
3 | "B" |
null | "C" |
5 | "B" |
df2 = pl.DataFrame(
{
"nrs": [6],
"names": ["wow"],
"random": [0.9],
"groups": ["B"],
}
)
df3 = pl.DataFrame(
{
"primes": [2, 3, 5, 7, 11],
}
)
# Append rows of DataFrames.
pl.concat([df, df2])
shape: (6, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
1 | "foo" | 0.3 | "A" |
2 | "ham" | 0.7 | "A" |
3 | "spam" | 0.1 | "B" |
null | "egg" | 0.9 | "C" |
5 | null | 0.6 | "B" |
6 | "wow" | 0.9 | "B" |
# Append columns of DataFrames
pl.concat([df, df3], how="horizontal")
shape: (5, 5)
nrs | names | random | groups | primes |
--- | --- | --- | --- | --- |
i64 | str | f64 | str | i64 |
1 | "foo" | 0.3 | "A" | 2 |
2 | "ham" | 0.7 | "A" | 3 |
3 | "spam" | 0.1 | "B" | 5 |
null | "egg" | 0.9 | "C" | 7 |
5 | null | 0.6 | "B" | 11 |
# Spread rows into columns
df.pivot(values="nrs", index="groups", on="names")
shape: (3, 6)
groups | foo | ham | spam | egg | null |
--- | --- | --- | --- | --- | --- |
str | i64 | i64 | i64 | i64 | i64 |
"A" | 1 | 2 | null | null | null |
"B" | null | null | 3 | null | 5 |
"C" | null | null | null | null | null |
# Gather columns into rows (this is equivalent to the deprecated melt() function)
df.unpivot(index="nrs", on=["names", "groups"])
shape: (10, 3)
nrs | variable | value |
--- | --- | --- |
i64 | str | str |
1 | "names" | "foo" |
2 | "names" | "ham" |
3 | "names" | "spam" |
null | "names" | "egg" |
5 | "names" | null |
1 | "groups" | "A" |
2 | "groups" | "A" |
3 | "groups" | "B" |
null | "groups" | "C" |
5 | "groups" | "B" |
Reshaping Data - Joining Data Sets
df4 = pl.DataFrame(
{
"nrs": [1, 2, 5, 6],
"animals": ["cheetah", "lion", "leopard", "tiger"],
}
)
# Inner join
# Retains only rows with a match in the other set.
df.join(df4, on="nrs")
# or
df.join(df4, on="nrs", how="inner")
shape: (3, 5)
nrs | names | random | groups | animals |
--- | --- | --- | --- | --- |
i64 | str | f64 | str | str |
1 | "foo" | 0.3 | "A" | "cheetah" |
2 | "ham" | 0.7 | "A" | "lion" |
5 | null | 0.6 | "B" | "leopard" |
# Left join
# Retains each row from "left" set (df).
df.join(df4, on="nrs", how="left")
shape: (5, 5)
nrs | names | random | groups | animals |
--- | --- | --- | --- | --- |
i64 | str | f64 | str | str |
1 | "foo" | 0.3 | "A" | "cheetah" |
2 | "ham" | 0.7 | "A" | "lion" |
3 | "spam" | 0.1 | "B" | null |
null | "egg" | 0.9 | "C" | null |
5 | null | 0.6 | "B" | "leopard" |
# Full (outer) join
# Retains each row from both frames, even if no matching row exists in the other.
df.join(df4, on="nrs", how="full")
shape: (6, 6)
nrs | names | random | groups | nrs_right | animals |
--- | --- | --- | --- | --- | --- |
i64 | str | f64 | str | i64 | str |
1 | "foo" | 0.3 | "A" | 1 | "cheetah" |
2 | "ham" | 0.7 | "A" | 2 | "lion" |
3 | "spam" | 0.1 | "B" | null | null |
null | "egg" | 0.9 | "C" | null | null |
5 | null | 0.6 | "B" | 5 | "leopard" |
null | null | null | null | 6 | "tiger" |
# Anti join
# Contains all rows from df that do not have a match in df4.
df.join(df4, on="nrs", how="anti")
shape: (2, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
3 | "spam" | 0.1 | "B" |
null | "egg" | 0.9 | "C" |
Misc
Changing Entries That Match Certain Criteria
Use the when().then().otherwise() chain for this.
df.with_columns(
pl.when((c("groups") == "A") & (c("names") == "foo"))
.then(9.0)
.otherwise(c.random)
.alias("random")
)
shape: (5, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
1 | "foo" | 9.0 | "A" |
2 | "ham" | 0.7 | "A" |
3 | "spam" | 0.1 | "B" |
null | "egg" | 0.9 | "C" |
5 | null | 0.6 | "B" |
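You can chain several when/then branches before the final otherwise; a small sketch (the bucket labels are made up):

df.with_columns(
    pl.when(pl.col("random") > 0.8)
    .then(pl.lit("high"))
    .when(pl.col("random") > 0.5)
    .then(pl.lit("medium"))
    .otherwise(pl.lit("low"))
    .alias("random_bucket")
)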
Handling Missing Data
# How many nulls per column?
df.null_count()
shape: (1, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
u32 | u32 | u32 | u32 |
1 | 1 | 0 | 0 |
# Drop rows with any column having a null value
df.drop_nulls()
shape: (3, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
1 | "foo" | 0.3 | "A" |
2 | "ham" | 0.7 | "A" |
3 | "spam" | 0.1 | "B" |
# Replace null values with given value
df.fill_null(42)
shape: (5, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
1 | "foo" | 0.3 | "A" |
2 | "ham" | 0.7 | "A" |
3 | "spam" | 0.1 | "B" |
42 | "egg" | 0.9 | "C" |
5 | null | 0.6 | "B" |
# Replace null values using forward strategy
df.fill_null(strategy="forward")
# Other fill strategies are "backward", "min", "max", "mean", "zero" and "one"
shape: (5, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
1 | "foo" | 0.3 | "A" |
2 | "ham" | 0.7 | "A" |
3 | "spam" | 0.1 | "B" |
3 | "egg" | 0.9 | "C" |
5 | "egg" | 0.6 | "B" |
# Replace floating point NaN values with a given value
# (NaN and null are different things; df has no NaNs, so nothing changes here)
df.fill_nan(42)
shape: (5, 4)
nrs | names | random | groups |
--- | --- | --- | --- |
i64 | str | f64 | str |
1 | "foo" | 0.3 | "A" |
2 | "ham" | 0.7 | "A" |
3 | "spam" | 0.1 | "B" |
null | "egg" | 0.9 | "C" |
5 | null | 0.6 | "B" |
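fill_null also accepts expressions, so you can fill per column, e.g. with that column's mean; a small sketch:

df.with_columns(
    pl.col("nrs").fill_null(pl.col("nrs").mean()),
    pl.col("names").fill_null(pl.lit("unknown")),
)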
Rolling Functions
# The following rolling functions are available
import numpy as np
df.select(
pl.col("random"),
# Rolling maximum value
pl.col("random").rolling_max(window_size=2).alias("rolling_max"),
# Rolling mean value
pl.col("random").rolling_mean(window_size=2).alias("rolling_mean"),
# Rolling median value
pl.col("random")
.rolling_median(window_size=2, min_periods=2)
.alias("rolling_median"),
# Rolling minimum value
pl.col("random").rolling_min(window_size=2).alias("rolling_min"),
# Rolling standard deviation
pl.col("random").rolling_std(window_size=2).alias("rolling_std"),
# Rolling sum values
pl.col("random").rolling_sum(window_size=2).alias("rolling_sum"),
# Rolling variance
pl.col("random").rolling_var(window_size=2).alias("rolling_var"),
# Rolling quantile
pl.col("random")
.rolling_quantile(quantile=0.75, window_size=2, min_periods=2)
.alias("rolling_quantile"),
# Rolling skew
pl.col("random").rolling_skew(window_size=2).alias("rolling_skew"),
# Rolling custom function
pl.col("random")
.rolling_map(function=np.nanstd, window_size=2)
.alias("rolling_apply"),
)
shape: (5, 11)
random | rolling_max | rolling_mean | rolling_median | rolling_min | rolling_std | rolling_sum | rolling_var | rolling_quantile | rolling_skew | rolling_apply |
--- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
0.3 | null | null | null | null | null | null | null | null | null | null |
0.7 | 0.7 | 0.5 | 0.5 | 0.3 | 0.282843 | 1.0 | 0.08 | 0.7 | -4.3368e-16 | 0.2 |
0.1 | 0.7 | 0.4 | 0.4 | 0.1 | 0.424264 | 0.8 | 0.18 | 0.7 | 3.8549e-16 | 0.3 |
0.9 | 0.9 | 0.5 | 0.5 | 0.1 | 0.565685 | 1.0 | 0.32 | 0.9 | 0.0 | 0.4 |
0.6 | 0.9 | 0.75 | 0.75 | 0.6 | 0.212132 | 1.5 | 0.045 | 0.9 | 0.0 | 0.15 |
Window Functions
# Window functions let you compute aggregations over (several different) groupings without collapsing the rows
df.select(
"names",
"groups",
"random",
pl.col("random").sum().over("names").alias("sum_by_names"),
pl.col("random").sum().over("groups").alias("sum_by_groups"),
)
shape: (5, 5)
names | groups | random | sum_by_names | sum_by_groups |
--- | --- | --- | --- | --- |
str | str | f64 | f64 | f64 |
"foo" | "A" | 0.3 | 0.3 | 1.0 |
"ham" | "A" | 0.7 | 0.7 | 1.0 |
"spam" | "B" | 0.1 | 0.1 | 0.7 |
"egg" | "C" | 0.9 | 0.9 | 0.9 |
null | "B" | 0.6 | 0.6 | 0.7 |
Date Range Creation
# create data with pl.date_range
from datetime import date
df_date = pl.DataFrame(
{
# eager=True is important to turn the expression into actual data
"date": pl.date_range(
date(2024, 1, 1), date(2024, 1, 7), interval="1d", eager=True
),
"value": [1, 2, 3, 4, 5, 6, 7],
}
)
df_date
shape: (7, 2)
date | value |
--- | --- |
date | i64 |
2024-01-01 | 1 |
2024-01-02 | 2 |
2024-01-03 | 3 |
2024-01-04 | 4 |
2024-01-05 | 5 |
2024-01-06 | 6 |
2024-01-07 | 7 |
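Once you have a date column, the .dt accessor gives you the usual date parts; a small sketch:

df_date.with_columns(
    pl.col("date").dt.weekday().alias("weekday"),  # 1 = Monday ... 7 = Sunday
    pl.col("date").dt.month().alias("month"),
)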
Time-based Downsampling - group_by_dynamic()
group_by_dynamic has many useful options ([docs]).
(df_date.group_by_dynamic("date", every="1w").agg(c.value.sum()))
shape: (1, 2)
date | value |
--- | --- |
date | i64 |
2024-01-01 | 28 |
Custom Expressions
You can create your own expressions and reuse them throughout your projects.
def normalize_str_col(col_name: str) -> pl.Expr:
return pl.col(col_name).str.to_lowercase().str.replace_all(" ", "_")
df.select(new_col=normalize_str_col(col_name="groups"))
shape: (5, 1)
new_col |
--- |
str |
"a" |
"a" |
"b" |
"c" |
"b" |
Plotting
Polars does not implement plotting itself, but delegates to hvplot. The plot functions are available via the .plot accessor ([docs]).
(
df.group_by("groups", maintain_order=True)
.agg(pl.sum("random"))
.plot.bar(x="groups", y="random")
)
Debugging
If you have a long chain of transformations, it can be handy to look at / log intermediate steps.
Write little helpers to make this easy and call them via pipe().
def log_df(df: pl.DataFrame, prefix="") -> pl.DataFrame:
print(f"{prefix}shape:{df.shape} schema: {dict(df.schema)}")
return df
(
df.pipe(log_df, "step 1: ")
.filter(c.random > 0.5)
.pipe(log_df, "step 2: ")
.select("names")
.pipe(log_df, "step 3: ")
)
step 1: shape:(5, 4) schema: {'nrs': Int64, 'names': String, 'random': Float64, 'groups': String}
step 2: shape:(3, 4) schema: {'nrs': Int64, 'names': String, 'random': Float64, 'groups': String}
step 3: shape:(3, 1) schema: {'names': String}
Eager, lazy, out-of-core
Lazy mode allows optimization of the query plan.
Out-of-core or streaming allows you to work with data that is bigger than RAM.
# eager loading, lazy execution
data = pl.read_parquet("df.parquet")
(
data.lazy()
.group_by("groups")
.agg(pl.sum("random").alias("total"))
.sort("total")
# till here nothing really happened
.collect() # now we execute the plan and collect the results
)
shape: (3, 2)
groups | total |
--- | --- |
str | f64 |
"B" | 0.7 |
"C" | 0.9 |
"A" | 1.0 |
# lazy loading, lazy execution
data = pl.scan_parquet("df.parquet")
(
data.lazy()
.group_by("groups")
.agg(pl.sum("random").alias("total"))
.sort("total")
# till here nothing really happened
# with the next line, we execute the plan and collect the results
.collect()
)
shape: (3, 2)
groups | total |
--- | --- |
str | f64 |
"B" | 0.7 |
"C" | 0.9 |
"A" | 1.0 |
# stream data
data = pl.scan_parquet("df.parquet")
(
data.lazy()
.group_by("groups")
.agg(pl.sum("random").alias("total"))
.sort("total")
# till here nothing really happened
# with the next line, we execute the plan in a streaming fashion
.collect(streaming=True)
)
shape: (3, 2)
groups | total |
--- | --- |
str | f64 |
"B" | 0.7 |
"C" | 0.9 |
"A" | 1.0 |
Data Validation with Pandera
Since 0.19 pandera offers polars support.
That means you can validate the schema and data of your polars DataFrame.
This is just a sneak peek, read the docs for more.
import pandera.polars as pa
# define your schema in as much detail as you want
class MySchema(pa.DataFrameModel):
nrs: int
names: str # or pl.String
# different range
random: float = pa.Field(in_range={"min_value": 1.0, "max_value": 2.0})
# C is not allowed
groups: str = pa.Field(isin=["A", "B"])
class Config:
# All existing columns must be listed in the schema
strict = True
# Then validate it.
# Use lazy=True to run all validations before throwing the SchemaErrors
try:
MySchema.validate(df, lazy=True)
except pa.errors.SchemaErrors as e:
print("Got SchemaErrors exception.")
print(e)
Got SchemaErrors exception.
{
"SCHEMA": {
"SERIES_CONTAINS_NULLS": [
{
"schema": "MySchema",
"column": "nrs",
"check": "not_nullable",
"error": "non-nullable column 'nrs' contains null values"
},
{
"schema": "MySchema",
"column": "names",
"check": "not_nullable",
"error": "non-nullable column 'names' contains null values"
}
]
},
"DATA": {
"DATAFRAME_CHECK": [
{
"schema": "MySchema",
"column": "random",
"check": "in_range(1.0, 2.0)",
"error": "Column 'random' failed validator number 0: <Check in_range: in_range(1.0, 2.0)> failure case examples: [{'random': 0.3}, {'random': 0.7}, {'random': 0.1}, {'random': 0.9}, {'random': 0.6}]"
},
{
"schema": "MySchema",
"column": "groups",
"check": "isin(['A', 'B'])",
"error": "Column 'groups' failed validator number 0: <Check isin: isin(['A', 'B'])> failure case examples: [{'groups': 'C'}]"
}
]
}
}
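For completeness, a frame that satisfies MySchema validates without raising; a small sketch with made-up values:

ok = pl.DataFrame(
    {
        "nrs": [1, 2],
        "names": ["foo", "bar"],
        "random": [1.2, 1.8],   # inside the allowed range
        "groups": ["A", "B"],   # only allowed values
    }
)
MySchema.validate(ok)  # returns the validated DataFrame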