Pandas Cheatsheet
This page maps common pandas operations to equivalent Pivotal patterns. It is
aimed at pandas users who already know what they want to do and need the
Pivotal code for the same operation.
The examples assume a pandas DataFrame named sales unless the operation loads
or combines data.
Load and Inspect
| Task |
pandas |
Pivotal |
| Read a CSV |
sales = pd.read_csv("sales.csv")
|
load "sales.csv" as sales
|
| Read with options |
sales = pd.read_csv(
"sales.csv",
sep=";",
header=0,
)
|
load "sales.csv" as sales
sep ";"
header 0
|
| Show rows |
|
|
| Summary statistics |
|
|
Rows and Columns
| Task |
pandas |
Pivotal |
| Filter rows |
active = sales[sales["status"] == "active"]
|
with sales as active
filter status == "active"
|
| Filter with multiple conditions |
high_value = sales[
(sales["status"] == "active")
& (sales["revenue"] > 1000)
]
|
with sales as high_value
filter status == "active"
filter revenue > 1000
|
| Filter with a list |
east_west = sales[
sales["region"].isin(["East", "West"])
]
|
with sales as east_west
filter region in ["East", "West"]
|
| Select columns |
report = sales[["order_id", "region", "revenue"]]
|
with sales as report
select order_id, region, revenue
|
| Drop columns |
clean = sales.drop(
columns=["debug_flag", "notes"]
)
|
with sales as clean
drop debug_flag, notes
|
| Rename columns |
renamed = sales.rename(
columns={
"qty": "quantity",
"rev": "revenue",
}
)
|
with sales as renamed
rename qty as quantity, rev as revenue
|
| Sort rows |
top = sales.sort_values(
"revenue",
ascending=False,
)
|
with sales as top
sort revenue desc
|
| Remove duplicate rows |
unique_sales = sales.drop_duplicates()
|
with sales as unique_sales
distinct
|
| Remove duplicates by selected columns |
unique_products = sales.drop_duplicates(
["product", "category"]
)
|
with sales as unique_products
distinct product, category
|
Create and Clean Columns
| Task |
pandas |
Pivotal |
| Create a column |
sales["margin"] = sales["revenue"] - sales["cost"]
|
with sales
margin = revenue - cost
|
| Create a conditional column |
sales["tier"] = np.where(
sales["revenue"] > 1000,
"high",
"standard",
)
|
with sales
tier = "high"
where revenue > 1000
else "standard"
|
| Fill missing values |
clean = sales.fillna({
"region": "Unknown",
"revenue": 0,
})
|
with sales as clean
fillna
region = "Unknown"
revenue = 0
|
| Drop rows with missing values |
complete = sales.dropna(
subset=["customer_id", "revenue"]
)
|
with sales as complete
dropna customer_id, revenue
|
| Cast a column |
sales["order_date"] = pd.to_datetime(
sales["order_date"]
)
|
with sales
cast order_date as datetime
|
| Round values |
sales["revenue"] = sales["revenue"].round(2)
|
with sales
round revenue 2
|
Aggregation and Reshaping
| Task |
pandas |
Pivotal |
| Group and sum |
by_region = (
sales
.groupby("region", as_index=False)["revenue"]
.sum()
)
|
with sales as by_region
group by region
agg sum revenue as revenue
|
| Group with several aggregations |
summary = (
sales
.groupby("region", as_index=False)
.agg(
total=("revenue", "sum"),
avg_order=("revenue", "mean"),
orders=("order_id", "count"),
)
)
|
with sales as summary
group by region
agg sum revenue as total
agg mean revenue as avg_order
agg count order_id as orders
|
| Aggregate all rows |
totals = sales.agg(
total=("revenue", "sum"),
avg=("revenue", "mean"),
)
|
with sales as totals
agg sum revenue as total, mean revenue as avg
|
| Pivot table |
wide = sales.pivot_table(
index="product",
columns="region",
values="revenue",
aggfunc="sum",
)
|
with sales as wide
pivot
rows product
cols region
agg sum revenue
|
| Melt wide data to long |
long = monthly_sales.melt(
id_vars=["region"],
value_vars=["jan", "feb", "mar"],
var_name="month",
value_name="revenue",
)
|
with monthly_sales as long
unpivot
id region
cols jan, feb, mar
variable "month"
value "revenue"
|
Combine Tables
| Task |
pandas |
Pivotal |
| Left join |
enriched = sales.merge(
customers,
on="customer_id",
how="left",
)
|
with sales as enriched
left merge customers on customer_id
|
| Join with different key names |
enriched = sales.merge(
customers,
left_on="customer_id",
right_on="id",
how="left",
)
|
with sales as enriched
left merge customers
left_on customer_id
right_on id
|
| Stack rows |
all_sales = pd.concat([
jan_sales,
feb_sales,
mar_sales,
])
|
with jan_sales as all_sales
concat feb_sales, mar_sales
|
Window-Style Operations
| Task |
pandas |
Pivotal |
| Rank within groups |
sales["regional_rank"] = (
sales.groupby("region")["revenue"]
.rank(ascending=False)
)
|
with sales
rank revenue desc as regional_rank
by region
|
| Lag a value |
sales = sales.sort_values("date")
sales["prev_revenue"] = (
sales.groupby("region")["revenue"]
.shift(1)
)
|
with sales
lag revenue 1 as prev_revenue
by region
order date
|
| Running total |
sales = sales.sort_values("date")
sales["running_revenue"] = (
sales.groupby("region")["revenue"]
.cumsum()
)
|
with sales
cumsum revenue as running_revenue
by region
order date
|
| Rolling average |
sales = sales.sort_values("date")
sales["rolling_avg"] = (
sales.groupby("region")["revenue"]
.rolling(7)
.mean()
.reset_index(level=0, drop=True)
)
|
with sales
rolling mean revenue 7 as rolling_avg
by region
order date
|
Python Variables and Functions
Use : when a Pivotal statement should read a Python object from the surrounding
notebook or script.
| Task |
pandas |
Pivotal |
| Use a Python threshold |
threshold = 1000
top = sales[sales["revenue"] > threshold]
|
with sales as top
filter revenue > :threshold
|
| Use a Python column list |
cols = ["order_id", "region", "revenue"]
report = sales[cols]
|
with sales as report
select :cols
|
| Apply a Python function to the table |
clean = clean_sales(sales)
|
with sales as clean
apply :clean_sales
|
For exact command forms and less common options, see the
Syntax Reference.