My Small Polars Tutorial

Created at 2022-09-09 by Stefan Otte.

This is a small polars tutorial. It covers basic polars concepts as well as some random (hopefully) useful things.

It is based on the great polars cheat sheet but is heavily extended and restructured. You can find a notebook version here.

Here are some important facts that you should know when you work with polars.

polars DataFrames don't have an index. That means no .reset_index() every third line and no multi-index complexity, but some operations are a bit more cumbersome in return.
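For example, here is a minimal sketch (with a made-up `id` column) of what index-free row access looks like:

import polars as pl

df = pl.DataFrame({"id": ["a", "b", "c"], "val": [1, 2, 3]})

# "label" lookups are just filters -- there is no .set_index()/.loc
df.filter(pl.col("id") == "b")

# positional access works via plain slicing
df[1:3]

# if you need an explicit row number, add it as a regular column
df.with_row_index()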

There are two main concepts in polars: expressions and context.

  • expression: what to do with the data without actually knowing the data, e.g. pl.col("foo").sort().head(2)
  • context: the context in which an expression is evaluated, e.g. in a group_by

Because of the expression/context setup, method chaining makes even more sense than with pandas.
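A minimal sketch (with made-up data) of the same expression evaluated in two different contexts:

import polars as pl

df = pl.DataFrame({"foo": [3, 1, 2, 5, 4], "grp": ["a", "a", "b", "b", "b"]})

# one expression ...
expr = pl.col("foo").sort().head(2)

# ... evaluated in a select context (over the whole frame)
df.select(expr)

# ... and in a group_by/agg context (per group)
df.group_by("grp").agg(expr)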

These seven verbs cover most things you want to do with polars:

select        # select columns (and add new ones)
with_columns  # like select but keep existing columns
sort          # sort rows
filter        # filter rows
group_by      # group dataframe
agg           # aggregate groups
join          # join/merge another dataframe
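Strung together, a typical chain looks roughly like this (a sketch with made-up frames df and other, not the data used below):

import polars as pl

df = pl.DataFrame(
    {"nrs": [1, 2, 3], "random": [0.3, 0.7, 0.1], "groups": ["A", "A", "B"]}
)
other = pl.DataFrame({"groups": ["A", "B"], "label": ["first", "second"]})

(
    df.filter(pl.col("random") > 0.2)  # filter rows
    .with_columns(product=pl.col("random") * pl.col("nrs"))  # add a new column
    .group_by("groups")  # group the frame ...
    .agg(pl.sum("product").alias("total"))  # ... and aggregate the groups
    .join(other, on="groups", how="left")  # join another frame
    .sort("total")  # sort rows
    .select("groups", "label", "total")  # select columns
)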

As always, read the friendly manual to really understand how to use polars.

IMO reading manuals is a super power and everybody can attain it. :shrug:

Setup

Install

!cat requirements.txt
polars[plot,pyarrow]==1.2.1
pandera[polars]==0.20.3
jupyter
jupytext
ruff
!pip install -r requirements.txt

Import

import polars as pl

# I personally like to import col as c to shorten some expressions
from polars import col as c

Basics

Creating/Reading/Saving DataFrames

# Create DataFrame
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        "random": [0.3, 0.7, 0.1, 0.9, 0.6],
        "groups": ["A", "A", "B", "C", "B"],
    }
)
df
shape: (5, 4)
nrs   names   random  groups
i64   str     f64     str
1     "foo"   0.3     "A"
2     "ham"   0.7     "A"
3     "spam"  0.1     "B"
null  "egg"   0.9     "C"
5     null    0.6     "B"
# Save dataframes as csv
# (do yourself a favour and switch to parquet instead of CSV!)
df.write_csv("df.csv")
# Read CSV
(pl.read_csv("df.csv", columns=["nrs", "names", "random", "groups"]).equals(df))
True
# Save dataframe as parquet
df.write_parquet("df.parquet")
# Read parquet
# Note: you can also read multiple frames with wildcards
(pl.read_parquet("df*.parquet").equals(df))
True

Select columns - select()

# Select multiple columns with specific names
df.select("nrs", "names")
# equivalent
df.select(pl.col("nrs"), pl.col("names"))
df.select(pl.col.nrs, pl.col.names)
shape: (5, 2)
nrs   names
i64   str
1     "foo"
2     "ham"
3     "spam"
null  "egg"
5     null
df.select(pl.all().exclude("random"))
shape: (5, 3)
nrs   names   groups
i64   str     str
1     "foo"   "A"
2     "ham"   "A"
3     "spam"  "B"
null  "egg"   "C"
5     null    "B"
# Select columns whose name matches a regular expression.
df.select(pl.col("^n.*$"))
shape: (5, 2)
nrs   names
i64   str
1     "foo"
2     "ham"
3     "spam"
null  "egg"
5     null

Add New Columns - select() and with_columns()

df.select(NAMES=c.names)
df.select(c("names").alias("NAMES"))
shape: (5, 1)
NAMES
str
"foo"
"ham"
"spam"
"egg"
null
# Keep existing and add new columns with `with_columns`
df.with_columns((pl.col("random") * pl.col("nrs")).alias("product"))
df.with_columns(product=pl.col("random") * pl.col("nrs"))
shape: (5, 5)
nrs   names   random  groups  product
i64   str     f64     str     f64
1     "foo"   0.3     "A"     0.3
2     "ham"   0.7     "A"     1.4
3     "spam"  0.1     "B"     0.3
null  "egg"   0.9     "C"     null
5     null    0.6     "B"     3.0
# Add several new columns to the DataFrame
df.with_columns(
    product=(pl.col("random") * pl.col("nrs")),
    names_len_bytes=pl.col("names").str.len_bytes(),
)
shape: (5, 6)
nrs   names   random  groups  product  names_len_bytes
i64   str     f64     str     f64      u32
1     "foo"   0.3     "A"     0.3      3
2     "ham"   0.7     "A"     1.4      3
3     "spam"  0.1     "B"     0.3      4
null  "egg"   0.9     "C"     null     3
5     null    0.6     "B"     3.0      null
# Add a column 'index' that enumerates the rows
df.with_row_index()
shape: (5, 5)
index  nrs   names   random  groups
u32    i64   str     f64     str
0      1     "foo"   0.3     "A"
1      2     "ham"   0.7     "A"
2      3     "spam"  0.1     "B"
3      null  "egg"   0.9     "C"
4      5     null    0.6     "B"

Select rows - filter() and friends

# Filter: Extract rows that meet logical criteria.
df.filter(pl.col("random") > 0.5)
df.filter(c("random") > 0.5)
df.filter(c.random > 0.5)
shape: (3, 4)
nrs   names  random  groups
i64   str    f64     str
2     "ham"  0.7     "A"
null  "egg"  0.9     "C"
5     null   0.6     "B"
df.filter((pl.col("groups") == "B") & (pl.col("random") > 0.5))
shape: (1, 4)
nrs  names  random  groups
i64  str    f64     str
5    null   0.6     "B"
# Randomly select fraction of rows.
df.sample(fraction=0.5)
shape: (2, 4)
nrs  names  random  groups
i64  str    f64     str
2    "ham"  0.7     "A"
1    "foo"  0.3     "A"
# Randomly select n rows.
df.sample(n=2)
shape: (2, 4)
nrs   names   random  groups
i64   str     f64     str
3     "spam"  0.1     "B"
null  "egg"   0.9     "C"
# Select first n rows
df.head(n=2)

# Select last n rows.
df.tail(n=2)
shape: (2, 4)
nrs   names  random  groups
i64   str    f64     str
null  "egg"  0.9     "C"
5     null   0.6     "B"

Select rows and columns

# Select rows 2-4
df[2:4, :]
shape: (2, 4)
nrs   names   random  groups
i64   str     f64     str
3     "spam"  0.1     "B"
null  "egg"   0.9     "C"
# Select columns in positions 1 and 3 (first column is 0).
df[:, [1, 3]]
shape: (5, 2)
names   groups
str     str
"foo"   "A"
"ham"   "A"
"spam"  "B"
"egg"   "C"
null    "B"
# Select rows meeting logical condition, and only the specific columns.
(df.filter(pl.col("random") > 0.5).select("names", "groups"))
shape: (3, 2)
names  groups
str    str
"ham"  "A"
"egg"  "C"
null   "B"
# Select one column as a Series
print(type(df["names"]))
df["names"]
<class 'polars.series.series.Series'>
shape: (5,)
names
str
"foo"
"ham"
"spam"
"egg"
null

Sort rows - sort()

# Order rows by values of a column (high to low)
df.sort("random", descending=True)
shape: (5, 4)
nrs   names   random  groups
i64   str     f64     str
null  "egg"   0.9     "C"
2     "ham"   0.7     "A"
5     null    0.6     "B"
1     "foo"   0.3     "A"
3     "spam"  0.1     "B"
# Order by multiple columns
df.sort("groups", "random")
shape: (5, 4)
nrs   names   random  groups
i64   str     f64     str
1     "foo"   0.3     "A"
2     "ham"   0.7     "A"
3     "spam"  0.1     "B"
5     null    0.6     "B"
null  "egg"   0.9     "C"

Summarize Data

# Tuple of # of rows, # of columns in DataFrame
df.shape
(5, 4)
# number of rows in DataFrame
len(df)
df.height
5
# number of cols in DataFrame
df.width
4
# Count the number of rows for each unique value of a column
df["groups"].value_counts()
shape: (3, 2)
groups  count
str     u32
"A"     2
"C"     1
"B"     2
# # of distinct values in a column
df["groups"].n_unique()
3
# Basic descriptive statistics for each column
df.describe()
shape: (9, 5)
statistic     nrs       names   random    groups
str           f64       str     f64       str
"count"       4.0       "4"     5.0       "5"
"null_count"  1.0       "1"     0.0       "0"
"mean"        2.75      null    0.52      null
"std"         1.707825  null    0.319374  null
"min"         1.0       "egg"   0.1       "A"
"25%"         2.0       null    0.3       null
"50%"         3.0       null    0.6       null
"75%"         3.0       null    0.7       null
"max"         5.0       "spam"  0.9       "C"
# Aggregation functions
df.select(
    # Sum values
    pl.sum("random").alias("sum"),
    # Minimum value
    pl.min("random").alias("min"),
    # Maximum value
    pl.max("random").alias("max"),
    # or
    pl.col("random").max().alias("other_max"),
    # Standard deviation
    pl.std("random").alias("std_dev"),
    # Variance
    pl.var("random").alias("variance"),
    # Median
    pl.median("random").alias("median"),
    # Mean
    pl.mean("random").alias("mean"),
    # Quantile
    pl.quantile("random", 0.75).alias("quantile_0.75"),
    # or
    pl.col("random").quantile(0.75).alias("other_quantile_0.75"),
    # First value
    pl.first("random").alias("first"),
)
shape: (1, 11)
sum  min  max  other_max  std_dev   variance  median  mean  quantile_0.75  other_quantile_0.75  first
f64  f64  f64  f64        f64       f64       f64     f64   f64            f64                  f64
2.6  0.1  0.9  0.9        0.319374  0.102     0.6     0.52  0.7            0.7                  0.3

Group And Aggregate Data - group_by() and agg()

# Group by values in the "groups" column, returning a GroupBy object
df.group_by("groups")
<polars.dataframe.group_by.GroupBy at 0x7f674d02c990>
# All of the aggregation functions from above can be applied to a group as well
df.group_by("groups").agg(
    # Sum values
    pl.sum("random").alias("sum"),
    # Minimum value
    pl.min("random").alias("min"),
    # Maximum value
    pl.max("random").alias("max"),
    # or
    pl.col("random").max().alias("other_max"),
    # Standard deviation
    pl.std("random").alias("std_dev"),
    # Variance
    pl.var("random").alias("variance"),
    # Median
    pl.median("random").alias("median"),
    # Mean
    pl.mean("random").alias("mean"),
    # Quantile
    pl.quantile("random", 0.75).alias("quantile_0.75"),
    # or
    pl.col("random").quantile(0.75).alias("other_quantile_0.75"),
    # First value
    pl.first("random").alias("first"),
)
shape: (3, 12)
groups  sum  min  max  other_max  std_dev   variance  median  mean  quantile_0.75  other_quantile_0.75  first
str     f64  f64  f64  f64        f64       f64       f64     f64   f64            f64                  f64
"B"     0.7  0.1  0.6  0.6        0.353553  0.125     0.35    0.35  0.6            0.6                  0.1
"C"     0.9  0.9  0.9  0.9        null      null      0.9     0.9   0.9            0.9                  0.9
"A"     1.0  0.3  0.7  0.7        0.282843  0.08      0.5     0.5   0.7            0.7                  0.3
# Additional GroupBy functions
(
    df.group_by("groups").agg(
        # Count the number of values in each group
        pl.count("random").alias("size"),
        # Sample one element in each group
        # (favour `map_elements` over `apply`)
        pl.col("names").map_elements(
            lambda group_df: group_df.sample(1).item(0), return_dtype=pl.String
        ),
    )
)
shape: (3, 3)
groups  size  names
str     u32   str
"A"     2     "foo"
"B"     2     null
"C"     1     "egg"
(df.group_by("groups").agg(pl.col("names").sample(1).alias("foo")))
shape: (3, 2)
groups  foo
str     list[str]
"C"     ["egg"]
"B"     [null]
"A"     ["foo"]

Reshaping Data - Changing Layout and Renaming

# Rename the columns of a DataFrame
df.rename({"nrs": "idx"})
shape: (5, 4)
idx   names   random  groups
i64   str     f64     str
1     "foo"   0.3     "A"
2     "ham"   0.7     "A"
3     "spam"  0.1     "B"
null  "egg"   0.9     "C"
5     null    0.6     "B"
# Drop columns from DataFrame
df.drop(["names", "random"])
shape: (5, 2)
nrs   groups
i64   str
1     "A"
2     "A"
3     "B"
null  "C"
5     "B"
df2 = pl.DataFrame(
    {
        "nrs": [6],
        "names": ["wow"],
        "random": [0.9],
        "groups": ["B"],
    }
)

df3 = pl.DataFrame(
    {
        "primes": [2, 3, 5, 7, 11],
    }
)
# Append rows of DataFrames.
pl.concat([df, df2])
shape: (6, 4)
nrs   names   random  groups
i64   str     f64     str
1     "foo"   0.3     "A"
2     "ham"   0.7     "A"
3     "spam"  0.1     "B"
null  "egg"   0.9     "C"
5     null    0.6     "B"
6     "wow"   0.9     "B"
# Append columns of DataFrames
pl.concat([df, df3], how="horizontal")
shape: (5, 5)
nrs   names   random  groups  primes
i64   str     f64     str     i64
1     "foo"   0.3     "A"     2
2     "ham"   0.7     "A"     3
3     "spam"  0.1     "B"     5
null  "egg"   0.9     "C"     7
5     null    0.6     "B"     11
# Spread rows into columns
df.pivot(on="names", index="groups", values="nrs")
shape: (3, 6)
groups  foo   ham   spam  egg   null
str     i64   i64   i64   i64   i64
"A"     1     2     null  null  null
"B"     null  null  3     null  5
"C"     null  null  null  null  null
# Gather columns into rows (this is equivalent to the deprecated melt() function)
df.unpivot(index="nrs", on=["names", "groups"])
shape: (10, 3)
nrs   variable  value
i64   str       str
1     "names"   "foo"
2     "names"   "ham"
3     "names"   "spam"
null  "names"   "egg"
5     "names"   null
1     "groups"  "A"
2     "groups"  "A"
3     "groups"  "B"
null  "groups"  "C"
5     "groups"  "B"

Reshaping Data - Joining Data Sets

df4 = pl.DataFrame(
    {
        "nrs": [1, 2, 5, 6],
        "animals": ["cheetah", "lion", "leopard", "tiger"],
    }
)
# Inner join
# Retains only rows with a match in the other set.
df.join(df4, on="nrs")
# or
df.join(df4, on="nrs", how="inner")
shape: (3, 5)
nrs  names  random  groups  animals
i64  str    f64     str     str
1    "foo"  0.3     "A"     "cheetah"
2    "ham"  0.7     "A"     "lion"
5    null   0.6     "B"     "leopard"
# Left join
# Retains each row from "left" set (df).
df.join(df4, on="nrs", how="left")
shape: (5, 5)
nrs   names   random  groups  animals
i64   str     f64     str     str
1     "foo"   0.3     "A"     "cheetah"
2     "ham"   0.7     "A"     "lion"
3     "spam"  0.1     "B"     null
null  "egg"   0.9     "C"     null
5     null    0.6     "B"     "leopard"
# Full (outer) join
# Retains each row, even if no matching row exists in the other set.
df.join(df4, on="nrs", how="full")
shape: (6, 6)
nrs   names   random  groups  nrs_right  animals
i64   str     f64     str     i64        str
1     "foo"   0.3     "A"     1          "cheetah"
2     "ham"   0.7     "A"     2          "lion"
3     "spam"  0.1     "B"     null       null
null  "egg"   0.9     "C"     null       null
5     null    0.6     "B"     5          "leopard"
null  null    null    null    6          "tiger"
# Anti join
# Contains all rows from df that do not have a match in df4.
df.join(df4, on="nrs", how="anti")
shape: (2, 4)
nrs   names   random  groups
i64   str     f64     str
3     "spam"  0.1     "B"
null  "egg"   0.9     "C"

Misc

Changing Entries That Match Certain Criteria

Use the when().then().otherwise() chain for this.

df.with_columns(
    pl.when((c("groups") == "A") & (c("names") == "foo"))
    .then(9.0)
    .otherwise(c.random)
    .alias("random")
)
shape: (5, 4)
nrs   names   random  groups
i64   str     f64     str
1     "foo"   9.0     "A"
2     "ham"   0.7     "A"
3     "spam"  0.1     "B"
null  "egg"   0.9     "C"
5     null    0.6     "B"

Handling Missing Data

# How many nulls per column?
df.null_count()
shape: (1, 4)
nrs  names  random  groups
u32  u32    u32     u32
1    1      0       0
# Drop rows with any column having a null value
df.drop_nulls()
shape: (3, 4)
nrs  names   random  groups
i64  str     f64     str
1    "foo"   0.3     "A"
2    "ham"   0.7     "A"
3    "spam"  0.1     "B"
# Replace null values with given value
df.fill_null(42)
shape: (5, 4)
nrs  names   random  groups
i64  str     f64     str
1    "foo"   0.3     "A"
2    "ham"   0.7     "A"
3    "spam"  0.1     "B"
42   "egg"   0.9     "C"
5    null    0.6     "B"
# Replace null values using forward strategy
df.fill_null(strategy="forward")
# Other fill strategies are "backward", "min", "max", "mean", "zero" and "one"
shape: (5, 4)
nrs  names   random  groups
i64  str     f64     str
1    "foo"   0.3     "A"
2    "ham"   0.7     "A"
3    "spam"  0.1     "B"
3    "egg"   0.9     "C"
5    "egg"   0.6     "B"
# Replace floating point NaN values with given value
df.fill_nan(42)
shape: (5, 4)
nrs   names   random  groups
i64   str     f64     str
1     "foo"   0.3     "A"
2     "ham"   0.7     "A"
3     "spam"  0.1     "B"
null  "egg"   0.9     "C"
5     null    0.6     "B"
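Note that df contains nulls but no NaNs, which is why fill_nan changes nothing here. A minimal sketch (with a made-up frame) of the null vs. NaN distinction:

nan_df = pl.DataFrame({"x": [1.0, float("nan"), None]})

nan_df.with_columns(
    # fill_nan only touches NaN values: [1.0, 42.0, null]
    pl.col("x").fill_nan(42).alias("nan_filled"),
    # fill_null only touches null values: [1.0, NaN, 42.0]
    pl.col("x").fill_null(42).alias("null_filled"),
)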

Rolling Functions

# The following rolling functions are available
import numpy as np

df.select(
    pl.col("random"),
    # Rolling maximum value
    pl.col("random").rolling_max(window_size=2).alias("rolling_max"),
    # Rolling mean value
    pl.col("random").rolling_mean(window_size=2).alias("rolling_mean"),
    # Rolling median value
    pl.col("random")
    .rolling_median(window_size=2, min_periods=2)
    .alias("rolling_median"),
    # Rolling minimum value
    pl.col("random").rolling_min(window_size=2).alias("rolling_min"),
    # Rolling standard deviation
    pl.col("random").rolling_std(window_size=2).alias("rolling_std"),
    # Rolling sum values
    pl.col("random").rolling_sum(window_size=2).alias("rolling_sum"),
    # Rolling variance
    pl.col("random").rolling_var(window_size=2).alias("rolling_var"),
    # Rolling quantile
    pl.col("random")
    .rolling_quantile(quantile=0.75, window_size=2, min_periods=2)
    .alias("rolling_quantile"),
    # Rolling skew
    pl.col("random").rolling_skew(window_size=2).alias("rolling_skew"),
    # Rolling custom function
    pl.col("random")
    .rolling_map(function=np.nanstd, window_size=2)
    .alias("rolling_apply"),
)
shape: (5, 11)
random  rolling_max  rolling_mean  rolling_median  rolling_min  rolling_std  rolling_sum  rolling_var  rolling_quantile  rolling_skew  rolling_apply
f64     f64          f64           f64             f64          f64          f64          f64          f64               f64           f64
0.3     null         null          null            null         null         null         null         null              null          null
0.7     0.7          0.5           0.5             0.3          0.282843     1.0          0.08         0.7               -4.3368e-16   0.2
0.1     0.7          0.4           0.4             0.1          0.424264     0.8          0.18         0.7               3.8549e-16    0.3
0.9     0.9          0.5           0.5             0.1          0.565685     1.0          0.32         0.9               0.0           0.4
0.6     0.9          0.75          0.75            0.6          0.212132     1.5          0.045        0.9               0.0           0.15

Window Functions

# Window functions let you aggregate over several different groupings at once without collapsing rows
df.select(
    "names",
    "groups",
    "random",
    pl.col("random").sum().over("names").alias("sum_by_names"),
    pl.col("random").sum().over("groups").alias("sum_by_groups"),
)
shape: (5, 5)
names   groups  random  sum_by_names  sum_by_groups
str     str     f64     f64           f64
"foo"   "A"     0.3     0.3           1.0
"ham"   "A"     0.7     0.7           1.0
"spam"  "B"     0.1     0.1           0.7
"egg"   "C"     0.9     0.9           0.9
null    "B"     0.6     0.6           0.7

Date Range Creation

# create data with pl.date_range
from datetime import date


df_date = pl.DataFrame(
    {
        # eager=True is important to turn the expression into actual data
        "date": pl.date_range(
            date(2024, 1, 1), date(2024, 1, 7), interval="1d", eager=True
        ),
        "value": [1, 2, 3, 4, 5, 6, 7],
    }
)
df_date
shape: (7, 2)
date        value
date        i64
2024-01-01  1
2024-01-02  2
2024-01-03  3
2024-01-04  4
2024-01-05  5
2024-01-06  6
2024-01-07  7

Time-based Downsampling - group_by_dynamic()

group_by_dynamic has many useful options.

[docs]

(df_date.group_by_dynamic("date", every="1w").agg(c.value.sum()))
shape: (1, 2)
date        value
date        i64
2024-01-01  28
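For instance, every controls how often a window starts and period how long it lasts, so windows can overlap (a sketch with made-up parameters; see the docs for the full list of options):

# 3-day windows, with a new window starting every 2 days
(
    df_date.group_by_dynamic("date", every="2d", period="3d")
    .agg(c.value.sum())
)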

Custom Expressions

You can create your own expressions and reuse them throughout your projects.

def normalize_str_col(col_name: str) -> pl.Expr:
    return pl.col(col_name).str.to_lowercase().str.replace_all(" ", "_")


df.select(new_col=normalize_str_col(col_name="groups"))
shape: (5, 1)
new_col
str
"a"
"a"
"b"
"c"
"b"

Plotting

Polars does not implement plotting itself, but delegates to hvplot. The plot functions are available via the .plot accessor.

[docs]

df
shape: (5, 4)
nrs   names   random  groups
i64   str     f64     str
1     "foo"   0.3     "A"
2     "ham"   0.7     "A"
3     "spam"  0.1     "B"
null  "egg"   0.9     "C"
5     null    0.6     "B"
(
    df.group_by("groups", maintain_order=True)
    .agg(pl.sum("random"))
    .plot.bar(x="groups", y="random")
)

Debugging

If you have a long chain of transformations, it can be handy to inspect or log intermediate steps. Write little helpers to make this easy and call them via pipe().

def log_df(df: pl.DataFrame, prefix="") -> pl.DataFrame:
    print(f"{prefix}shape:{df.shape}  schema: {dict(df.schema)}")
    return df


(
    df.pipe(log_df, "step 1: ")
    .filter(c.random > 0.5)
    .pipe(log_df, "step 2: ")
    .select("names")
    .pipe(log_df, "step 3: ")
)
step 1: shape:(5, 4)  schema: {'nrs': Int64, 'names': String, 'random': Float64, 'groups': String}
step 2: shape:(3, 4)  schema: {'nrs': Int64, 'names': String, 'random': Float64, 'groups': String}
step 3: shape:(3, 1)  schema: {'names': String}
shape: (3, 1)
names
str
"ham"
"egg"
null

Eager, lazy, out-of-core

Lazy mode allows polars to optimize the query plan.

Out-of-core or streaming mode allows you to work with data that is bigger than your RAM.

# eager loading, lazy execution
data = pl.read_parquet("df.parquet")
(
    data.lazy()
    .group_by("groups")
    .agg(pl.sum("random").alias("total"))
    .sort("total")
    # till here nothing really happened
    .collect()  # now we execute the plan and collect the results
)
shape: (3, 2)
groups  total
str     f64
"B"     0.7
"C"     0.9
"A"     1.0
# lazy loading, lazy execution
data = pl.scan_parquet("df.parquet")
(
    data.lazy()
    .group_by("groups")
    .agg(pl.sum("random").alias("total"))
    .sort("total")
    # till here nothing really happened
    # with the next line, we execute the plan and collect the results
    .collect()
)
shape: (3, 2)
groups  total
str     f64
"B"     0.7
"C"     0.9
"A"     1.0
# stream data
data = pl.scan_parquet("df.parquet")
(
    data.lazy()
    .group_by("groups")
    .agg(pl.sum("random").alias("total"))
    .sort("total")
    # till here nothing really happened
    # with the next line, we execute the plan in a streaming fashion
    .collect(streaming=True)
)
shape: (3, 2)
groups  total
str     f64
"B"     0.7
"C"     0.9
"A"     1.0

Data Validation with Pandera

Since version 0.19, pandera offers polars support. That means you can validate the schema and data of your polars DataFrame.

This is just a sneak peek, read the docs for more.

import pandera.polars as pa


# define your schema in as much detail as you want


class MySchema(pa.DataFrameModel):
    nrs: int
    names: str  # or pl.String
    # a range that the actual data violates (on purpose)
    random: float = pa.Field(in_range={"min_value": 1.0, "max_value": 2.0})
    # C is not allowed
    groups: str = pa.Field(isin=["A", "B"])

    class Config:
        # All existing columns must be listed in the schema
        strict = True
# Then validate it.
# Use lazy=True to run all validations before throwing the SchemaErrors
try:
    MySchema.validate(df, lazy=True)
except pa.errors.SchemaErrors as e:
    print("Got SchemaErrors exception.")
    print(e)
Got SchemaErrors exception.
{
    "SCHEMA": {
        "SERIES_CONTAINS_NULLS": [
            {
                "schema": "MySchema",
                "column": "nrs",
                "check": "not_nullable",
                "error": "non-nullable column 'nrs' contains null values"
            },
            {
                "schema": "MySchema",
                "column": "names",
                "check": "not_nullable",
                "error": "non-nullable column 'names' contains null values"
            }
        ]
    },
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": "MySchema",
                "column": "random",
                "check": "in_range(1.0, 2.0)",
                "error": "Column 'random' failed validator number 0: <Check in_range: in_range(1.0, 2.0)> failure case examples: [{'random': 0.3}, {'random': 0.7}, {'random': 0.1}, {'random': 0.9}, {'random': 0.6}]"
            },
            {
                "schema": "MySchema",
                "column": "groups",
                "check": "isin(['A', 'B'])",
                "error": "Column 'groups' failed validator number 0: <Check isin: isin(['A', 'B'])> failure case examples: [{'groups': 'C'}]"
            }
        ]
    }
}