Pandas
August 26, 2023About 6 min
pandas - PANel DAta Structures
A clean axis indexing design to support fast data alignment, lookups,
hierarchical indexing, and more high-performance data structures.
SQL-like functionality: GroupBy, joining/merging, etc. Missing data
handling.
import pandas as pd
Data structures: series, dataframe, panel
Series
Basic
obj = pd.Series([4, 7, -5, 3])
obj
obj.values
obj.index
# You can specify the desired indices
obj2 = pd.Series([4, 7, -5, 3],index=['d', 'b', 'a', 'c'])
obj2
Selecting values
obj2['a']
obj2[2]
obj2[['a', 'b', 'c']]
obj2[obj2 > 0]
obj2['d'] = 99
obj2 * 2
'b' in obj2 # series works like fixed-length ordered dictionary
sdict = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdict)
sdict = {'Oregon': 16000, 'Ohio': 35000, 'Texas': 71000, 'California': 'NaN'}
obj4 = pd.Series(sdict)
obj3 + obj4
DataFrame
Basic
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data) # from dictionary
frame = pd.DataFrame(list(range(10)), columns=['a']) # from list
frame = pd.DataFrame(obj3) # from series
frame = pd.DataFrame(np.random.randn(5,3)) # from np.array
frame2 = pd.DataFrame(data, columns=['year','state','pop','debt'],
index=['one', 'two', 'three', 'four', 'five'])
frame2.columns
frame2.index
data = {'Nevada': {2001: 2.4, 2002: 2.9}, # from nested dictionary
'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame = pd.DataFrame(data)
frame.index.name = 'year'
frame.columns.name = 'state'
frame.values
Data retrival
# by columns
frame2['state']
frame2.year
# by rows
frame2.ix['three']
frame2.ix[2]
frame2.loc['three']
frame2.iloc[2]
frame2.iloc[:, 1]
Reassigning values
frame2['debt'] = np.arange(5.)
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2['eastern'] = frame2.state == 'Ohio'
del frame2['eastern']
frame2.columns
Dropping entries
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
frame2
frame3 = frame2.drop('three', axis = 0)
frame3 = frame2.drop('year', axis = 1)
frame3 = frame2.drop(frame2.index[2], axis=0)
frame3 = frame2.drop(frame2.columns[2], axis=1)
Merging data (join)
basic
left_frame = pd.DataFrame({'key': range(5),
'left_value': ['a', 'b', 'c', 'd', 'e']})
right_frame = pd.DataFrame({'key': range(2, 7),
'right_value': ['f', 'g', 'h', 'i', 'j']})
left_frame
right_frame
Merging
# inner join
pd.merge(left_frame, right_frame, on='key', how='inner')
# left outer join
pd.merge(left_frame, right_frame, on='key', how='left')
# right outer join
pd.merge(left_frame, right_frame, on='key', how='right')
# full outer join
pd.merge(left_frame, right_frame, on='key', how='outer')
concatenate
# Combining data - adding rows
pd.concat([left_frame, right_frame])
# Combining data - adding columns
pd.concat([left_frame, right_frame], axis=1)
left_frame = left_frame.drop(1)
pd.concat([left_frame, right_frame], axis=1)
Groupping data (groupby)
basic
import pandas as pd
import numpy as np
###########################################################
# Check (and set) working directory to read data file
# Read in the data and check it out
mtcars = pd.read_csv("mtcars.csv")
mtcars.head()
mtcars.shape
aggregated
# Compute basic descriptive stats over data
mtcars.describe()
mtcars.mean() # also compute median, std, var, min, max, quantile
mtcars.median()
mtcars.mean(axis=1) # compute row means (ie across columns)
# How many automatic transmission cars are there?
mtcars[mtcars["am"]==0].shape
mtcars[mtcars["am"]==0]
# Plot a histogram
mtcars["mpg"].hist()
# Group by number of cylinders and describe
grouped_by_cyl = mtcars.groupby("cyl")
grouped_by_cyl.mean()
# can group by more than one category
grouped_by_cyl_am = mtcars.groupby(["cyl","am"])
# compute statistics aggregated over groupings
grouped_by_cyl_am.agg([np.mean, np.std])
# count the number of cars in each combination of cyl and am
counts = grouped_by_cyl_am['cyl'].count()
# plot the counts
df = counts.unstack()
ax = df.plot(kind='bar', stacked=True, figsize=(10, 5), colormap="BuGn")
ax.set_ylabel("Count")
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, loc='best')
Arithmetic between DataFrames and Series
arr = np.arange(12.).reshape((4, 3))
arr - arr[0]
# broadcasting down the rows
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
frame - series
frame.columns = ['e', 'd', 'b']
frame - series
# broadcasting down the columns
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame['d']
frame.sub(series, axis=0)
series = frame.d[['Texas', 'Ohio', 'Oregon', 'Utah']]
frame.sub(series, axis=0)
Reindexing
Basic
obj = pd.Series(range(3), index=['a', 'b', 'c'])
obj.index
obj.index[1] = 'd' # immutable
obj.index = ['a', 'd', 'c']
#import warnings
#warnings.simplefilter(action = "ignore", category = RuntimeWarning)
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj
With new values
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)
obj = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj
obj.reindex(range(6), method='ffill')
obj.reindex(range(6), method='bfill')
More
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
index=['a', 'c', 'd'],
columns=['Ohio', 'Texas', 'California'])
# reindex rows
frame.reindex(['a', 'b', 'c', 'd'])
# reindex columns
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)
# reindex rows and columns
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)
# alternative way to reindex
frame.ix[['a', 'b', 'c', 'd'], states]
Extra for indexing and slicing
obj = pd.Series(np.arange(4.),index=['a', 'b', 'c', 'd'])
obj[1] # works like in numpy
obj['b']
obj[1:2] # works like in Python
obj['b':'c'] # takes end-point like in R
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data[[1]]
data['two']
data[['two','four']]
data[:2]
data[data['three'] > 5]
data[data < 5] = 0
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data.ix['Colorado', ['two', 'three']]
data.ix[['Colorado','Utah'],['two', 'three']]
data.ix[2]
data.ix[:'Utah', 'two']
data.ix[data.three > 5,:3]
Hierarchical Indexing
'''
Enables multiple (two or more) index levels on an axis.
Provides a way to work with higher dimensional data at lower
dimensions.
'''
# Series
data = pd.Series(np.random.randn(10),
index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
[ 1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data['b']
data['b':'c']
data.ix[['b', 'c']]
data[:,2]
data.unstack()
data.unstack().stack()
# DataFrames
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
index=[['a','a', 'b', 'b'], [1, 2, 1, 2]],
columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'colour']
frame['Ohio']
frame.stack()
frame.stack(level=0)
frame.stack().stack()
frame.unstack().unstack()
frame.unstack(level=[0,1])
Function mapping
apply - vectorisation
frame = pd.DataFrame(np.random.randn(4, 3),
columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
f = lambda x: x.max() - x.min()
frame.apply(f)
def f(x):
return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)
# applymap - element-wise application
format = lambda x: '%.2f' % x # returns a formatted string
frame.applymap(format)
Missing data
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()
data[data.notnull()]
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj = pd.Series(sdata, index=states)
pd.isnull(obj)
pd.notnull(obj)
obj.fillna(0)
obj.fillna(0, inplace=True)
obj.fillna(obj.mean())
df = pd.DataFrame(np.random.randn(7, 3))
df.ix[:4, 1] = NA; df.ix[:2, 2] = NA
df
cleaned = df.dropna()
cleaned
cleaned2 = df.dropna(thresh=2)
cleaned2
Add/Delete/Query/Mofify
Add
Add Column
# method 1
citys = ['ny','zz','xy']
df.insert(0,'city',citys) #add column city to the first position, values are specified wity cities
# method 2
jobs = ['student','AI','teacher']
df['job'] = jobs # Add column job to the last position, values are specified with jobs
# method 3
df.loc[:,'salary'] = ['1k','2k','2k','2k','3k'] #Add column salary to the last postion
Add Rows
# add a new row with index equals 4 if index 4 does not exists
# update the data with index equals 4 if index 4 exists
df.loc[4] = ['zz','mason','m',24,'engineer’]
# construct a new dataframe
df_insert = pd.DataFrame({'name':['mason','mario'],'sex':['m','f'],'age':[21,22]},index = [4,5])
# get a new dataframe, do not modify df
# The meaning of ignore_index
# False: the result using the index from df insert
# True: the result doesn't using the index from df insert, and will generate a new index
ndf = df.append(df_insert,ignore_index = True)
Delete
delete row
df.drop([1,3],axis = 0,inplace = False)#delete index=1, 3 two rows
delete column
df.drop(['name'],axis = 1,inplace = False) #delete column column
del df['name'] #delete name column
ndf = df.pop('age') #delete age column
Query
[]
df['name'] # select the column 'name'
df['gender'] # select the column 'gender'
df[['name','gender']] #select multi columns
df[0:] # select data from the row 0 to the end
df[:2] # select data from the begining to the row 2, (row 2 is not included)
df[0:1] # select only row 0
df[1:3] # select row 1 and row 2
df[-1:] # select the last row
df[-3:-1] # select the last 3 column to the last 1 column, the last 1 column is not included.
loc[]
# select the specified row and columns
df.loc[0,'name'] # select the cell are specified with row=0, and column='name'
df.loc[0:2, ['name','age']] #select data are specfied with the range row=[0, 2), column=['name', 'age']
df.loc[[2,3],['name','age']] #select the row 2 and row 3, columns are specified with 'name' and 'age'
df.loc[df['gender']=='M','name'] #select column name with rows gender='M'
df.loc[df['gender']=='M',['name','age']] #select column name, age with rows gender='M'
iloc[]
df.iloc[0,0] #Select row=0, column=0
df.iloc[1,2] #Select row=1, column=2
df.iloc[[1,3],0:2] #select row=1,3, column from 0 to 2 (2 is excluded)
df.iloc[1:3,[1,2] #select row from 1 to 3 (3 is excluded), column=1,2
Mofify
loc[]
df.loc[1,'name'] = 'aa' #modify cell value
df.loc[1] = ['bb','ff',11] #modify row
df.loc[1,['name','age']] = ['bb',11] #modify some values
iloc[]
df.iloc[1,2] = 19 #
df.iloc[:,2] = [11,22,33] #
df.iloc[0,:] = ['lily','F',15] #
Sort Value
df.sort_values(by='cost', ascending=False)