PANDAS


Here, you will find a concise guide to using the powerful data manipulation library, Pandas. It covers a range of essential Pandas functions: data loading, manipulation, filtering, grouping, merging, and more. It's a cheat sheet designed to be easy to use, while still providing comprehensive information


PREVIEW DATASET, LOAD DATASET, CREATE DATAFRAME

# Preview top lines before loading :
with open (datasetfile, 'r') as f:  
    print(f.readlines()[:10]) # Extract the first 10 lines

# Load Data Set
# json, excel;
datasetfile=r'C:\Users\....json'
df = pd.read_json(datasetfile)
df= pd.read_excel(datasetfile,sheet_name='sheet',index_col=None)
df= pd.read_excel(datasetfile,sheet_name='sheet',index_col=0) # controls where is the index
# csv
header_list = ["Name", "Dept", "Start Date"]
df = pd.read_csv("sample_file.csv", names=header_list)
df = pd.read_csv('file.csv',sep=';', header=None, index_col='col',
parse_dates=['TIME'], encoding='UTF-8', dtype={'col1':str,'col2':int},
usecols=columnsList, skiprows = rowslist, nrows=5, on_bad_lines='skip')

# create dataset from lists
headers=['planet','period of revolution','distance to the sun']
earth=['earth',365,150.10]
venus=['venus',225,108.10]
mars=['mars',687,228]
data=[earth,venus,mars]
df = pd.DataFrame(data, columns=headers)

# create dataset from dictionaries
data = [
    {'Name': 'John', 'Age': 28, 'City': 'New York'},
    {'Name': 'Emma', 'Age': 24, 'City': 'London'},
    {'Name': 'Mike', 'Age': 32, 'City': 'Paris'}
]
df = pd.DataFrame(data)

or 

data = {'year':[2021,2022,2023,2024],
	'value1':[10,12,3,5],'value2':['a',12,3,4],
	'value3':[None,12,3,4]}
df = pd.DataFrame(data)


DATA OVERVIEW

# Show all columns
pd.set_option('display.max_columns', None)

# Data overview
print(df.shape) # rows, cols
print(df.head(n)) # first n rows
print(df.tail(n)) # last n rows
print(df.head().iloc[:, :p]) # first n rows and first p columns
print(df.info()) # entries , non-null, types , columns names
print(df.describe()) # statistical summary
print(df.size) # entries even NaN
print(len(df.index)) 
print(list(df.columns)) # list of the column headers

# Count of unique values of each column
df.nunique()

# Count of unique types
df['value'].apply(type).nunique()

# Get the unique set of types
df['value'].apply(type).unique()

# Get data types for each column 
df.dtypes

# Data stats overview after group by
df.groupby(['country_name','country_code'])['value'].describe()

# Quick bar charts to get another overview
df.plot.bar(x='year') # Bar chart , one bar a row
plt.hist(df['values'], bins=10, color='blue', edgecolor='black') # histogram
df.groupby('country')['value'].sum().plot(kind="bar",color="orange") # Bar chart after 1 groupby
df.groupby('year')['value'].count().plot(kind="bar",color="orange") # Bar chart after 1 groupby

NaN VALUES

# Checking for NaN values in a specific column.
df['values'].isna().any()

# Counting Nan values in a specific column
df['values'].isna().sum()

# Removing rows containing missing values in a specific column
df = df.dropna(subset=['values'])

CHANGE DATA TYPES

# Change to integer type :
df['values1'] = df['values1'].astype(int)

RESHAPE DATAFRAME

# Transpose dataframe
df = df.transpose()

# Reshape : pivot
df.pivot(index='Year', columns='Country', values='some_values')

EDIT VALUES

# edit a value
df.iloc[2, 2] = 'new value'

INDEX : SET, RESET, RENAME

# Set / Reset an index column
df = df.set_index('time-tag') # uses 'time-tag' column as an index 
df = df.set_index(df.columns[0]) 
df.reset_index(inplace=True)
df = df.reset_index(drop = False) # adds a new column with the index values as the first column

# Rename index
df = df.rename_axis('name')

TIMESERIES

# TimeSeries : check if 'time' column is recognized as datetime
print('IS DATETIME',df['time'].dtype == 'datetime64[ns]')

# TimeSeries : convert index to DatetimeIndex
df.index = pd.to_datetime(df.index)

# TimeSeries : Resample DataFrame to month, week, year..
df=df.resample('M').mean()

# TimeSeries : filter the DataFrame from a specific date
start_date_str = '1950-01-01' # string
start_date = pd.to_datetime(start_date_str)
df = df[df.index >= start_date]

# TimeSeries : Set x-axis ticks every n days, if index. 
tick_spacing = 3650

# TimeSeries : change the date index format to YYYY-MM (strftime returns a new Index, so assign it back)
df.index = df.index.strftime('%Y-%m')

# Timeseries : combine the year and month columns into a new datetime column 'date'
df107['date'] = pd.to_datetime(df107['Year'].astype(str) + '-' + df107['Month'].astype(str))

COLUMNS : RENAME, ADD, DELETE

# Change columns name
df.rename(columns={'old':'new'},inplace = True)
df.rename(columns={df.columns[0]:'new'},inplace=True)
df.columns = ['name'] + df.columns.tolist()[1:]

# Add a column from list
df['New Column'] = my_list

# Add a calculated column
df['New Calculated Column'] = df['value'] / 100

# Add a cumulative sum column
df['Cumulative Column'] = df['value'].cumsum()

# Add a bins column
df['bins'] = pd.cut(df['Value'], bins = 3)
df['bins'] = pd.cut(df['Value'], bins = 3, labels = [1,2,3])
df['bins'] = pd.cut(df['Value'], bins = [0, 40, 80, 120])
df['bins'] = pd.cut(df['Value'], bins = [0, 40, 80, 120], labels = [1,2,3])

# Delete rows / columns 
df = df.drop('row2', axis=0) 
df = df.drop('colB', axis=1)
df = df.drop(df.columns[[0, 2]], axis=1)
df = df.drop(columns=['B', 'C'])

COLUMNS : WORKING WITH DATA

# Get the name of the nth column
df.columns[n-1]    

# Get the unique values of a column
df['country_name'].unique()

# Get max value of a column
df['value'].max()

# Get the list of values of a column 
df['country_name'].tolist()

# Get percentile : 80% of datapoints
df_sorted = df.sort_values(by='sy_dist')
percentile_80 = df_sorted['value'].quantile(0.8)

# Correlations
correlation = df1['col1'].corr(df2['col2'])

COLUMNS : SELECTION

# Selecting columns 
df = df.loc[:, df.columns.isin(COUNTRIES)] 

# Select data by rows / columns 
df['column label']
df.loc[row1_label]
df.loc[row11_label, column1_label]
df.iloc[0] # row0
df.iloc[[1,3,10]] # rows 1 ,3 ,10
df.iloc[:, 1] # column1
df.iloc[[0, 2], [1, 2]] # subsets rows 0,2 and columns 1,2

FILTERING ROWS

# Filtering rows
df = df[df['value'] == some_value]
df = df[df['value'] > some_value]
df = df[(df['country']=='country1') & (df['Year']=='2022')]
df = df[df['country_name'].isin(listofcountries)]
df = df[df['year'].between(2015,2020)]
df = df[df.index=='2023-01-01']
df = df[df['TIME']>pd.to_datetime('2019')]
df = df[df[0].str.contains('string')]

# Filtering rows with query
# column names with spaces or special characters should be enclosed in backticks (`)
df = df.query("value1 > 5")
df = df.query("value2 == 'a'")
df = df.query("2022 <= year <= 2023")
df = df.query("value3.notna()")
selection = [2022, 2024]
result = df.query("year in @selection")

SORTS

df = df.sort_values(['colA', 'colB'], ascending = (False, True))

GROUP BY

# Group by and keep several columns :
df.groupby(['country_name','country_code'])['value'].sum()

# Group by and aggregate
df.groupby('country').agg({'value1': 'mean', 'value2': 'sum'})

# Group by and keep top n values
df.groupby(['country_name'])['value'].sum().sort_values(ascending = False)[0:n]

# Group by and get the list of categories for the top 10 values 
df.groupby('category')['values'].count().sort_values(ascending = False).head().index.tolist()

# Group by : get one line per group
df.groupby("Category").first()
df.groupby("Category").last()
df.groupby("Category").nth(0) # nth() needs a row position, e.g. 0 for the first row of each group

# Group by : get one group
df_group.get_group('Category')

# Group by : statistical summary
df.groupby("category")[["Quantity"]].describe()

CONCATENATE, UNIONS

# Concatenate :
#  vertical stack and reset index
df3 = pd.concat([df1,df2], axis = 0 ).reset_index()
# horizontal stack
df4 = pd.concat([df1,df2], axis = 1)

# Unions
descriptions = pd.DataFrame({
    'material': ['material1', 'material2', 'material3'],
    'description': ['descr1', 'descr2', 'descr3']})
prices = pd.DataFrame({
        'material': ['material1', 'material3'],
        'price': ['price1', 'price2']  })
case1_result = pd.merge(descriptions, prices, on='material', how='left')
case2_result = pd.merge(prices, descriptions, on='material', how='left')
print(case1_result)
print(case2_result)
Case 1:
    material description   price
0  material1      descr1  price1
1  material2      descr2     NaN
2  material3      descr3  price2
Case 2:
    material   price description
0  material1  price1      descr1
1  material3  price2      descr3

# Union with data expansion
groups = pd.DataFrame({'customer': ['customer1', 'customer2','customer3'],
    'group': ['group1', 'group1', 'group2']})
prices = pd.DataFrame({
    'group': ['group1', 'group1', 'group2', 'group2'],
    'material': ['material1', 'material2', 'material1', 'material4'],
    'price': ['price1', 'price2', 'price3', 'price4']})
final_table = pd.merge(groups, prices, on='group')

CHARTS

# Chart : add a title
ax = df.plot(kind='line')
ax.set_title('My Title',fontweight='bold')

# Chart : remove x label
ax.set_xlabel('')

# Chart : bars 
df.plot(kind="bar",color="orange")

# Chart : stacked bars
df_pivot = df.pivot(index='year', columns='country_name', values='value')
ax = df_pivot.plot.bar(stacked=True, figsize=(10,6))

# Chart : stacked bars for top 10 
df_grouped=df.groupby(['country_name'])['value'].sum()
df_top10 = df_grouped.sort_values(ascending=False).head(10)
df_filtered=df[df['country_name'].isin(df_top10.index)]
df_pivot=df_filtered.pivot(index='year', columns='country_name', values='value')
ax = df_pivot.plot.bar(stacked=True)

# Chart : configure fig with matplotlib 
fig = plt.gcf()

# Chart : after group by ( data object is a series)
df = df.groupby('year')['name'].count()
ax.bar(df.index, df.values, color = 'white')

# Chart : show 
# because df.plot() returns an instance of a Matplotlib AxesSubplot
import matplotlib.pyplot as plt
plt.show()

# Chart : Pareto for a dataframe with values, categories. 
# If there's no category, it can be created with bins using pd.cut(), see above 
# assign dataframe column headers to variables  
category = 'Category' 
value = 'Value'
# evaluate total value
total = df[value].sum()
# group by  
serie_sum_par_cat = df.groupby(category)[value].sum().sort_values(ascending = False)
# series object to dataframe object :  
df_sum_par_cat = serie_sum_par_cat.reset_index()
# add cumulative sum column
df_sum_par_cat['cumul'] = df_sum_par_cat[value].cumsum() / total * 100
print(df_sum_par_cat)
# chart
fig, ax1 = plt.subplots()
ax1.bar(df_sum_par_cat[category],df_sum_par_cat[value])
ax1.set_ylabel(('value label'))
ax1.set_xlabel('by category')
ax2 = ax1.twinx()
ax2.plot(df_sum_par_cat[category],df_sum_par_cat['cumul'],color='red')
ax2.set_ylabel('Cumul % of'+ value)
plt.show()

# Chart : boxplot / box and whiskers - one box a year
grouped = df.groupby('year')
sorted_years = sorted(df['year'].unique())
data_per_year = [grouped.get_group(year)['value'] for year in sorted_years]
bplot = ax.boxplot(data_per_year,
                    vert = True,  # 
                    patch_artist = True,  # 
                    labels = sorted_years)
for patch in bplot['boxes']:
	patch.set_facecolor('white')
for whisker in bplot['whiskers']:
    whisker.set(color = 'white')
for flier in bplot['fliers']:
    flier.set(marker = 'o', color='white', markerfacecolor = 'white')
for cap in bplot['caps']:
    cap.set(color = 'white')
plt.xticks(rotation=45)
plt.show()

SAVE

# Save as excel, csv, txt
df.to_excel('filename.xlsx',sheet_name='name',index=False)
df.to_csv('filename.csv',index=False)
df.to_csv('filename.txt',sep=';',index=False)

APPLY FUNCTIONS AND OPERATIONS

# Apply functions with .pipe()
def square_values(df, column_name):
    """Square every value of *column_name* in place and return df.

    Returning the DataFrame makes the function chainable with ``.pipe()``.
    (Original body mixed a tab-indented line with a space-indented
    ``return`` — a TabError in Python 3; indentation normalized to 4 spaces.)
    """
    df[column_name] = df[column_name] ** 2
    return df
def divide_values(df, column_name):
    """Halve every value of *column_name* in place and return df.

    Returning the DataFrame makes the function chainable with ``.pipe()``.
    (Original body mixed a tab-indented line with a space-indented
    ``return`` — a TabError in Python 3; indentation normalized to 4 spaces.)
    """
    df[column_name] = df[column_name] / 2
    return df
result = (df
	.pipe(divide_values, 'values1')   	
	.pipe(square_values, 'values1')
          ) 

# Operations
df['values1'] = df['values1'] / 2


Example #1. Reading an excel crosstab table and reshaping in a crosstab table :

# initial index is 1st column
df= pd.read_excel(datasetfile, sheet_name='Table 9',
    skiprows=skiprows,index_col=0) 
# transpose data :
df = df.transpose() 
# reset index :
df = df.reset_index(drop=False) 
# set a new index
df = df.set_index(df.columns[0]) 
# assigns a name or header to the index :
df = df.rename_axis('year') 
# keep only the columns whose headers are present in a predefined list of countries
df = df.loc[:, df.columns.isin(COUNTRIES)] 
print(df.head().iloc[:, :5])

Example #2: Unpivoting


    From Pivot / Crosstab :
            Customer1  Customer2
    2023       5000       2000
    2022       4000       3000

    To Long Format / Tidy Format:
    Year     Customer    Value
    0  2023  Customer1   5000
    1  2022  Customer1   4000
    2  2023  Customer2   2000
    3  2022  Customer2   3000    

# Create Crosstab dataframe
data = {'Customer1': [5000, 4000], 'Customer2': [2000, 3000]}
df = pd.DataFrame(data, index=[2023, 2022])
# melt
melted_df = df.reset_index().melt(id_vars='index', var_name='Customer', value_name='Value')
# Rename
melted_df.columns = ['Year', 'Customer', 'Value']

Example #3 : exploring groupby object

Understand its structure without performing common operations like sum and mean. In this script, 'df' is a DataFrame with one row per product, four categories labeled as A, B, C, and D, and a 'values' column. This script illustrates that the GroupBy object serves as a container holding a tuple for each category. Within this tuple, you'll find a DataFrame and the category name.

grouped = df.groupby(['Category'])
print(type(grouped)) # prints : 'pandas.core.groupby.generic.DataFrameGroupBy'
print(len(grouped)) # prints : 4
for group in grouped:
    print(type(group), type(group[0]), type(group[1]) )
# prints 4 times :
# class 'tuple' class 'str' class 'pandas.core.frame.DataFrame'