Here you will find a concise guide to the powerful data-manipulation library Pandas. It covers a range of essential Pandas operations: data loading, manipulation, filtering, grouping, merging, and more. It is a cheat sheet designed to be easy to use while still providing comprehensive information.
PREVIEW DATASET, LOAD DATASET, CREATE DATAFRAME
# Preview top lines before loading :
# islice reads only the lines needed, unlike f.readlines()[:10]
# which loads the whole file into memory first
from itertools import islice
with open(datasetfile, 'r') as f:
    print(list(islice(f, 10)))  # Extract the first 10 lines
# Load Data Set
# json, excel;
datasetfile = r'C:\Users\....json'
df = pd.read_json(datasetfile)
df = pd.read_excel(datasetfile, sheet_name='sheet', index_col=None)
df = pd.read_excel(datasetfile, sheet_name='sheet', index_col=0)  # controls which column is the index
# csv
header_list = ["Name", "Dept", "Start Date"]
df = pd.read_csv("sample_file.csv", names=header_list)
# NOTE: parse_dates must be given once — EITHER a list of column names
# to parse, OR True to parse the index (the original passed it twice,
# which is a SyntaxError)
df = pd.read_csv('file.csv', sep=';', header=None, index_col='col',
                 parse_dates=['TIME'], encoding='UTF-8',
                 dtype={'col1': str, 'col2': int},
                 usecols=columnsList, skiprows=rowslist, nrows=5,
                 on_bad_lines='skip')
# create dataset from lists
# each inner list is one row; `headers` supplies the column names
headers=['planet','period of revolution','distance to the sun']
earth=['earth',365,150.10]
venus=['venus',225,108.10]
mars=['mars',687,228]
data=[earth,venus,mars]
df = pd.DataFrame(data, columns=headers)
# create dataset from dictionaries (one dict per row; keys become columns)
data = [
    {'Name': 'John', 'Age': 28, 'City': 'New York'},
    {'Name': 'Emma', 'Age': 24, 'City': 'London'},
    {'Name': 'Mike', 'Age': 32, 'City': 'Paris'}
]
df = pd.DataFrame(data)
# or from a dict of columns (column name -> list of values)
# (the bare word "or" was a SyntaxError; it must be a comment)
data = {'year':[2021,2022,2023,2024],
        'value1':[10,12,3,5],'value2':['a',12,3,4],
        'value3':[None,12,3,4]}
df = pd.DataFrame(data)
DATA OVERVIEW
# Show all columns
pd.set_option('display.max_columns', None)
# Data overview
print(df.shape) # rows, cols
print(df.head(n)) # first n rows
print(df.tail(n)) # last n rows
print(df.head().iloc[:, :p]) # first 5 rows and first p columns
print(df.info()) # entries , non-null, types , columns names
print(df.describe()) # statistical summary
print(df.size) # number of cells, NaN included
print(len(df.index)) # number of rows
print(list(df.columns)) # list of the column headers
# Count of unique values of each column
df.nunique()
# Count of unique types
df['value'].apply(type).nunique()
# Get the unique set of types
df['value'].apply(type).unique()
# Get data types for each column
df.dtypes
# Data stats overview after group by
df.groupby(['country_name','country_code'])['value'].describe()
# Quick bar charts to get another overview
df.plot.bar(x='year') # Bar chart , one bar a row
# fixed: the column selection bracket must close before the keyword args
plt.hist(df['values'], bins=10, color='blue', edgecolor='black') # histogram
df.groupby('country')['value'].sum().plot(kind="bar",color="orange") # Bar chart after 1 groupby
df.groupby('year')['value'].count().plot(kind="bar",color="orange") # Bar chart after 1 groupby
NaN VALUES
# Checking for NaN values in a specific column.
# .isna() returns a boolean Series; .any() is True if at least one NaN
df['values'].isna().any()
# Counting NaN values in a specific column (True counts as 1 when summed)
df['values'].isna().sum()
# Removing rows containing missing values in a specific column
# dropna returns a new DataFrame; reassign to keep the result
df = df.dropna(subset=['values'])
CHANGE DATA TYPES
# Change to integer type :
# NOTE(review): astype(int) raises if the column contains NaN — drop or fill NaN first
df['values1'] = df['values1'].astype(int)
RESHAPE DATAFRAME
# Transpose dataframe (rows become columns and vice versa; returns a new object)
df = df.transpose()
# Reshape : pivot
# index/columns/values name existing columns; pivot fails on duplicate
# (index, columns) pairs — use pivot_table to aggregate duplicates
df.pivot(index='Year', columns='Country', values='some_values')
EDIT VALUES
# edit a value
# .iloc addresses by integer position: row 2, column 2 (0-based)
df.iloc[2, 2] = 'new value'
INDEX : SET, RESET, RENAME
# Set / Reset an index column
df = df.set_index('time-tag') # uses 'time-tag' column as an index
df = df.set_index(df.columns[0]) # uses the first column as an index
df.reset_index(inplace=True)
df = df.reset_index(drop = False) # adds a new column with the index values as the first column
# Rename index (fixed typo: 'nanme' -> 'name')
df = df.rename_axis('name')
TIMESERIES
# TimeSeries : check if 'time' column is recognized as datetime
print('IS DATATIME',df['time'].dtype == 'datetime64[ns]')
# TimeSeries : convert index to DatetimeIndex
df.index = pd.to_datetime(df.index)
# TimeSeries : Resample DataFrame to month, week, year..
# NOTE(review): the 'M' alias is deprecated in pandas >= 2.2 in favor of 'ME' — confirm target version
df=df.resample('M').mean()
# TimeSeries : filter the DataFrame from a specific date
start_date_str = '1950-01-01' # string
start_date = pd.to_datetime(start_date_str)
df = df[df.index >= start_date]
# TimeSeries : Set x-axis ticks every n days, if index.
tick_spacing = 3650
# TimeSeries : change the date index format to YYYY-MM
df.index.strftime('%Y-%m')
# TimeSeries : combine the year and month columns into a new datetime column 'date'
df107['date'] = pd.to_datetime(df107['Year'].astype(str) + '-' + df107['Month'].astype(str))
COLUMNS : RENAME, ADD, DELETE
# Change columns name
df.rename(columns={'old':'new'},inplace = True)
df.rename(columns={df.columns[0]:'new'},inplace=True)
# rename only the first column by rebuilding the header list
df.columns = ['name'] + df.columns.tolist()[1:]
# Add a column from list (the list length must match the number of rows)
df['New Column'] = my_list
# Add a calculated column
df['New Calculated Column'] = df['value'] / 100
# Add a cumulative sum column (fixed typo: 'Ccumulative' -> 'Cumulative')
df['Cumulative Column'] = df['value'].cumsum()
# Add a bins column (pd.cut buckets continuous values into intervals)
df['bins'] = pd.cut(df['Value'], bins = 3)                                  # 3 equal-width bins
df['bins'] = pd.cut(df['Value'], bins = 3, labels = [1,2,3])                # with custom labels
df['bins'] = pd.cut(df['Value'], bins = [0, 40, 80, 120])                   # explicit bin edges
df['bins'] = pd.cut(df['Value'], bins = [0, 40, 80, 120], labels = [1,2,3])
# Delete rows / columns
df = df.drop('row2', axis=0)              # drop a row by index label
df = df.drop('colB', axis=1)              # drop a column by name
df = df.drop(df.columns[[0, 2]], axis=1)  # drop columns by position
df = df.drop(columns=['B', 'C'])          # drop several columns by name
COLUMNS : WORKING WITH DATA
# Get the name of the nth column
df.columns[n-1]
# Get the unique values of a column
df['country_name'].unique()
# Get max value of a column
df['value'].max()
# Get the list of values of a column
df['country_name'].tolist()
# Get percentile : 80% of datapoints
# NOTE(review): quantile() does not require sorting — the sort_values call
# is only needed if df_sorted is reused elsewhere
df_sorted = df.sort_values(by='sy_dist')
percentile_80 = df_sorted['value'].quantile(0.8)
# Correlations
# Pearson correlation between two Series (rows aligned by index)
correlation = df1['col1'].corr(df2['col2'])
COLUMNS : SELECTION
# Selecting columns
# keep only columns whose header appears in the COUNTRIES list
df = df.loc[:, df.columns.isin(COUNTRIES)]
# Select data by rows / columns
df['column label'] # one column, as a Series
df.loc[row1_label] # one row, by label
df.loc[row11_label, column1_label] # a single cell, by labels
df.iloc[0] # row0
df.iloc[[1,3,10]] # rows 1 ,3 ,10
df.iloc[:, 1] # column1
df.iloc[[0, 2], [1, 2]] # subsets rows 0,2 and columns 1,2
FILTERING ROWS
# Filtering rows
df = df[df['value'] == some_value]
df = df[df['value'] > some_value]
# combine conditions with & / |, each condition in parentheses
df = df[(df['country']=='country1') & (df['Year']=='2022')]
df = df[df['country_name'].isin(listofcountries)]
df = df[df['year'].between(2015,2020)] # inclusive on both ends
df = df[df.index=='2023-01-01']
df = df[df['TIME']>pd.to_datetime('2019')]
df = df[df[0].str.contains('string')] # column labelled 0 (e.g. a header=None CSV)
# Filtering rows with query
# column names with spaces or special characters should be enclosed in backticks (`)
df = df.query("value1 > 5")
df = df.query("value2 == 'a'")
df = df.query("2022 <= year <= 2023")
df = df.query("value3.notna()")
# @ references a Python variable from inside the query string
selection = [2022, 2024]
result = df.query("year in @selection")
SORTS
# Sort by several columns; `ascending` gives one direction per column
df = df.sort_values(['colA', 'colB'], ascending = (False, True))
GROUP BY
# Group by and keep several columns :
df.groupby(['country_name','country_code'])['value'].sum()
# Group by and aggregate
df.groupby('country').agg({'value1': 'mean', 'value2': 'sum'})
# Group by and keep top n values
df.groupby(['country_name'])['value'].sum().sort_values(ascending = False)[0:n]
# Group by and get the list of categories for the top 10 values
# (head() defaults to 5; pass 10 explicitly to match the comment)
df.groupby('category')['values'].count().sort_values(ascending = False).head(10).index.tolist()
# Group by : get one line per group
df.groupby("Category").first()
df.groupby("Category").last()
df.groupby("Category").nth(0)   # nth requires the row position; nth(0) is the first row of each group
# Group by : get one group
# get_group takes a group KEY (one of the values of 'Category'), not the column name
grouped = df.groupby('Category')
grouped.get_group('some_group_key')
# Group by : statistical summary
df.groupby("category")[["Quantity"]].describe()
CONCATENATE, UNIONS
# Concatenate :
# vertical stack and reset index
df3 = pd.concat([df1,df2], axis = 0 ).reset_index()
# horizontal stack
df4 = pd.concat([df1,df2], axis = 1)
# Unions
# left join keeps every row of the left table; unmatched rows get NaN
descriptions = pd.DataFrame({
    'material': ['material1', 'material2', 'material3'],
    'description': ['descr1', 'descr2', 'descr3']})
prices = pd.DataFrame({
    'material': ['material1', 'material3'],
    'price': ['price1', 'price2'] })
case1_result = pd.merge(descriptions, prices, on='material', how='left')
case2_result = pd.merge(prices, descriptions, on='material', how='left')
print(case1_result)
print(case2_result)
Case 1:
material description price
0 material1 descr1 price1
1 material2 descr2 NaN
2 material3 descr3 price2
Case 2:
material price description
0 material1 price1 descr1
1 material3 price2 descr3
# Union with data expansion
# many-to-many 'group' keys: each customer row is duplicated once per
# matching price row (pd.merge defaults to an inner join)
groups = pd.DataFrame({'customer': ['customer1', 'customer2','customer3'],
    'group': ['group1', 'group1', 'group2']})
prices = pd.DataFrame({
    'group': ['group1', 'group1', 'group2', 'group2'],
    'material': ['material1', 'material2', 'material1', 'material4'],
    'price': ['price1', 'price2', 'price3', 'price4']})
final_table = pd.merge(groups, prices, on='group')
CHARTS
# Chart : add a title
ax = df.plot(kind='line')
ax.set_title('My Title',fontweight='bold')
# Chart : remove x label
ax.set_xlabel('')
# Chart : bars
df.plot(kind="bar",color="orange")
# Chart : stacked bars
# pivot first so each country becomes its own column / bar segment
df_pivot = df.pivot(index='year', columns='country_name', values='value')
ax = df_pivot.plot.bar(stacked=True, figsize=(10,6))
# Chart : stacked bars for top 10
df_grouped=df.groupby(['country_name'])['value'].sum()
df_top10 = df_grouped.sort_values(ascending=False).head(10)
df_filtered=df[df['country_name'].isin(df_top10.index)]
df_pivot=df_filtered.pivot(index='year', columns='country_name', values='value')
ax = df_pivot.plot.bar(stacked=True)
# Chart : configure fig with matplotlib (gets the current figure created by df.plot)
fig = plt.gcf()
# Chart : after group by ( the data object is a Series )
df = df.groupby('year')['name'].count()
ax.bar(df.index, df.values, color = 'white')
# Chart : show
# because df.plot() returns an instance of a Matplotlib AxesSubplot
import matplotlib.pyplot as plt
plt.show()
# Chart : Pareto for a dataframe with values, categories.
# If there's no category, it can be created with bins using pd.cut(), see above
# assign dataframe column headers to variables
category = 'Category'
value = 'Value'
# evaluate total value
total = df[value].sum()
# group by and sort descending (largest categories first)
serie_sum_par_cat = df.groupby(category)[value].sum().sort_values(ascending = False)
# series object to dataframe object :
df_sum_par_cat = serie_sum_par_cat.reset_index()
# add cumulative sum column, as a percentage of the total
df_sum_par_cat['cumul'] = df_sum_par_cat[value].cumsum() / total * 100
print(df_sum_par_cat)
# chart : bars for the values, a second y-axis for the cumulative %
fig, ax1 = plt.subplots()
ax1.bar(df_sum_par_cat[category],df_sum_par_cat[value])
ax1.set_ylabel(('value label'))
ax1.set_xlabel('by category')
ax2 = ax1.twinx() # second y-axis sharing the same x-axis
ax2.plot(df_sum_par_cat[category],df_sum_par_cat['cumul'],color='red')
ax2.set_ylabel('Cumul % of'+ value)
plt.show()
# Chart : boxplot / box and whiskers - one box a year
grouped = df.groupby('year')
sorted_years = sorted(df['year'].unique())
# one Series of values per year, in chronological order
data_per_year = [grouped.get_group(year)['value'] for year in sorted_years]
# NOTE(review): the `labels` keyword is renamed `tick_labels` in matplotlib >= 3.9 — confirm target version
bplot = ax.boxplot(data_per_year,
vert = True, # vertical boxes
patch_artist = True, # fillable boxes (needed for set_facecolor below)
labels = sorted_years)
# style every artist of the plot
for patch in bplot['boxes']:
    patch.set_facecolor('white')
for whisker in bplot['whiskers']:
    whisker.set(color = 'white')
for flier in bplot['fliers']:
    flier.set(marker = 'o', color='white', markerfacecolor = 'white')
for cap in bplot['caps']:
    cap.set(color = 'white')
plt.xticks(rotation=45)
plt.show()
SAVE
# Save as excel, csv, txt
# the target path must be a string (the original passed a bare,
# unquoted name, which is a NameError)
df.to_excel('filename.xlsx', sheet_name='name', index=False)
df.to_csv('filename.csv', index=False)
df.to_csv('filename.txt', sep=';', index=False)
APPLY FUNCTIONS AND OPERATIONS
# Apply functions with .pipe()
def square_values(df, column_name):
    """Square every value of *column_name* in place and return df."""
    df[column_name] = df[column_name].pow(2)
    return df

def divide_values(df, column_name):
    """Halve every value of *column_name* in place and return df."""
    df[column_name] = df[column_name].div(2)
    return df
# chain the helpers; each .pipe call passes the DataFrame returned by the previous one
result = (df
.pipe(divide_values, 'values1')
.pipe(square_values, 'values1')
)
# Operations
# plain column arithmetic, equivalent to divide_values(df, 'values1')
df['values1'] = df['values1'] / 2
Example #1: Reading an Excel crosstab table and reshaping it into a new crosstab:
# initial index is 1st column
df= pd.read_excel(datasetfile, sheet_name='Table 9',
    skiprows=skiprows,index_col=0)
# transpose data :
df = df.transpose()
# reset index :
df = df.reset_index(drop=False)
# set a new index
df = df.set_index(df.columns[0])
# assigns a name or header to the index :
df = df.rename_axis('year')
# keep only the columns whose headers are present in a predefined list of countries
df = df.loc[:, df.columns.isin(COUNTRIES)]
# preview : first 5 rows and first 5 columns
print(df.head().iloc[:, :5])
Example #2: Unpivoting
From Pivot / Crosstab :
Customer1 Customer2
2023 5000 2000
2022 4000 3000
To Long Format / Tidy Format:
Year Customer Value
0 2023 Customer1 5000
1 2022 Customer1 4000
2 2023 Customer2 2000
3 2022 Customer2 3000
# Build the crosstab DataFrame (years as index, one column per customer)
crosstab = {'Customer1': [5000, 4000], 'Customer2': [2000, 3000]}
df = pd.DataFrame(crosstab, index=[2023, 2022])
# Unpivot: move the index into a column, stack the customer columns,
# then rename the former index column to 'Year'
melted_df = (
    df.reset_index()
      .melt(id_vars='index', var_name='Customer', value_name='Value')
      .rename(columns={'index': 'Year'})
)
Example #3 : exploring groupby object
The goal is to understand the structure of a GroupBy object before performing common operations such as sum or mean. In this script, 'df' is a DataFrame with one row per product, four categories labeled A, B, C, and D, and a 'values' column. The script illustrates that the GroupBy object is a container holding one tuple per category; each tuple contains the category name and the corresponding sub-DataFrame.
# a GroupBy object is lazy: nothing is computed until you iterate or aggregate
grouped = df.groupby(['Category'])
print(type(grouped)) # prints : 'pandas.core.groupby.generic.DataFrameGroupBy'
print(len(grouped)) # prints : 4 (one entry per distinct category)
# iterating a GroupBy yields one (key, sub-DataFrame) tuple per group
for group in grouped:
    print(type(group), type(group[0]), type(group[1]) )
# prints 4 times :
# class 'tuple' class 'str' class 'pandas.core.frame.DataFrame'
# NOTE(review): with pandas >= 2.0, grouping by a 1-element LIST yields
# tuple keys, so group[0] would be a tuple, not str — confirm pandas version